//Adapted from CUDA to OpenCL by Erwin Coumans //See http://bitbucket.org/erwincoumans/opencl_course // Copyright 2012 NVIDIA Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "b3OpenCLUtils.h" #include "../parallel_primitives/host/b3OpenCLArray.h" #include "../parallel_primitives/host/b3LauncherCL.h" #include "Bullet3Common/b3Quickprof.h" #include "../parallel_primitives/host/b3FillCL.h" #include "Bullet3Common/b3CommandLineArgs.h" #include #include #include //make sure to update the same #define in the opencl/lds_bank_conflict/lds_kernels.cl const int TILE_DIM = 32; const int BLOCK_ROWS = 8; const int NUM_REPS = 100; // Check errors and print GB/s void postprocess(const float *ref, const float *res, int n, float ms) { bool passed = true; for (int i = 0; i < n; i++) if (res[i] != ref[i]) { printf("\nError: at res[%d] got %f but expected %f\n", i, res[i], ref[i]); printf("%25s\n", "*** FAILED ***"); passed = false; break; } if (passed) printf("%20.2f\n", 2 * n * sizeof(float) * 1e-6 * NUM_REPS / ms ); } char* loadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) { // locals FILE* pFileStream = NULL; size_t szSourceLength; // open the OpenCL source code file pFileStream = fopen(cFilename, "rb"); if(pFileStream == 0) { return NULL; } size_t szPreambleLength = strlen(cPreamble); // get the length of the source code fseek(pFileStream, 0, SEEK_END); szSourceLength = ftell(pFileStream); fseek(pFileStream, 0, SEEK_SET); // allocate a buffer for the source code string and read it in char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); memcpy(cSourceString, cPreamble, szPreambleLength); fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream); // close the file and return the total length of the combined (preamble + source) string fclose(pFileStream); if(szFinalLength != 0) { *szFinalLength = szSourceLength + szPreambleLength; } cSourceString[szSourceLength + szPreambleLength] = '\0'; return cSourceString; } int main(int argc, char **argv) { printf("Use --deviceId= or --platformId= to override OpenCL device\n"); b3CommandLineArgs args(argc,argv); const int nx = 1024; const int ny = 1024; const int mem_size = nx*ny*sizeof(float); const int num_elements = nx*ny; b3Clock clock; double startEvent=0.f; double stopEvent=0.f; int localSizeX = TILE_DIM; int localSizeY = BLOCK_ROWS; int numThreadsX = (nx/TILE_DIM)*TILE_DIM; int numThreadsY = (ny/TILE_DIM)*BLOCK_ROWS; int gridX = numThreadsX / localSizeX; int gridY = numThreadsY / localSizeY; int ciErrNum = 0; int preferred_device = -1; int preferred_platform = -1; args.GetCmdLineArgument("deviceId",preferred_device); args.GetCmdLineArgument("platformId",preferred_platform); cl_platform_id platformId=0; cl_context ctx=0; cl_command_queue queue=0; cl_device_id device=0; cl_kernel copyKernel=0; cl_kernel copySharedMemKernel=0; cl_kernel transposeNaiveKernel = 0; cl_kernel transposeCoalescedKernel = 0; cl_kernel transposeNoBankConflictsKernel= 0; ctx = b3OpenCLUtils::createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum,0,0,preferred_device,preferred_platform,&platformId); b3OpenCLUtils::printPlatformInfo(platformId); oclCHECKERROR(ciErrNum, CL_SUCCESS); device = b3OpenCLUtils::getDevice(ctx,0); b3OpenCLUtils::printDeviceInfo(device); queue = clCreateCommandQueue(ctx, device, 0, &ciErrNum); const char* cSourceFile = "opencl/lds_bank_conflict/lds_kernels.cl"; size_t szKernelLength; const char* cSourceCL =0; char relativeFileName[1024]; { const char* prefix[]={"./","../","../../","../../../","../../../../"}; int numPrefixes = sizeof(prefix)/sizeof(char*); for (int i=0;!cSourceCL && i d_idataCL(ctx,queue);d_idataCL.resize(num_elements); b3OpenCLArray d_cdataCL(ctx,queue);d_cdataCL.resize(num_elements); b3OpenCLArray d_tdataCL(ctx,queue);d_tdataCL.resize(num_elements); // check parameters and calculate execution configuration if (nx % TILE_DIM || ny % TILE_DIM) { printf("nx and ny must be a multiple of TILE_DIM\n"); goto error_exit; } if (TILE_DIM % BLOCK_ROWS) { printf("TILE_DIM must be a multiple of BLOCK_ROWS\n"); goto error_exit; } // host for (int j = 0; j < ny; j++) for (int i = 0; i < nx; i++) h_idata[j*nx + i] = j*nx + i; // correct result for error checking for (int j = 0; j < ny; j++) for (int i = 0; i < nx; i++) { gold[j*nx + i] = h_idata[i*nx + j]; } d_idataCL.copyFromHostPointer(h_idata,num_elements); // events for timing clock.reset(); float ms; // ------------ // time kernels // ------------ printf("%25s%25s\n", "Routine", "Bandwidth (GB/s)"); // ---- // copy // ---- printf("%25s", "copy"); clMemSet.execute(d_cdataCL,0.f,num_elements); { // warm up b3LauncherCL launcher( queue, copyKernel); launcher.setBuffer( d_cdataCL.getBufferCL()); launcher.setBuffer( d_idataCL.getBufferCL()); launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); startEvent = clock.getTimeMicroseconds()/1e3; for (int i = 0; i < NUM_REPS; i++) launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); oclCHECKERROR(ciErrNum, CL_SUCCESS); clFinish(queue); stopEvent = clock.getTimeMicroseconds()/1e3; } ms = float(stopEvent-startEvent); d_cdataCL.copyToHostPointer(h_cdata,num_elements,0); postprocess(h_idata, h_cdata, nx*ny, ms); // ------------- // copySharedMem // ------------- printf("%25s", "shared memory copy"); clMemSet.execute(d_cdataCL,0.f,num_elements); { b3LauncherCL launcher( queue, copySharedMemKernel); launcher.setBuffer( d_cdataCL.getBufferCL()); launcher.setBuffer( d_idataCL.getBufferCL()); launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); startEvent = clock.getTimeMicroseconds()/1e3; for (int i = 0; i < NUM_REPS; i++) launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); oclCHECKERROR(ciErrNum, CL_SUCCESS); clFinish(queue); stopEvent = clock.getTimeMicroseconds()/1e3; } ms = float(stopEvent-startEvent); d_cdataCL.copyToHostPointer(h_cdata,num_elements,0); postprocess(h_idata, h_cdata, nx * ny, ms); // -------------- // transposeNaive // -------------- printf("%25s", "naive transpose"); clMemSet.execute(d_tdataCL,0.f,num_elements); { // warmup b3LauncherCL launcher( queue, transposeNaiveKernel); launcher.setBuffer( d_tdataCL.getBufferCL()); launcher.setBuffer( d_idataCL.getBufferCL()); launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); startEvent = clock.getTimeMicroseconds()/1e3; for (int i = 0; i < NUM_REPS; i++) launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); oclCHECKERROR(ciErrNum, CL_SUCCESS); clFinish(queue); stopEvent = clock.getTimeMicroseconds()/1e3; } ms = float(stopEvent-startEvent); d_tdataCL.copyToHostPointer(h_tdata,num_elements,0); postprocess(gold, h_tdata, nx * ny, ms); // ------------------ // transposeCoalesced // ------------------ printf("%25s", "coalesced transpose"); clMemSet.execute(d_tdataCL,0.f,num_elements); { b3LauncherCL launcher( queue, transposeCoalescedKernel); launcher.setBuffer( d_tdataCL.getBufferCL()); launcher.setBuffer( d_idataCL.getBufferCL()); launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); startEvent = clock.getTimeMicroseconds()/1e3; for (int i = 0; i < NUM_REPS; i++) launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); oclCHECKERROR(ciErrNum, CL_SUCCESS); clFinish(queue); stopEvent = clock.getTimeMicroseconds()/1e3; } ms = float(stopEvent-startEvent); d_tdataCL.copyToHostPointer(h_tdata,num_elements,0); postprocess(gold, h_tdata, nx * ny, ms); // ------------------------ // transposeNoBankConflicts // ------------------------ printf("%25s", "conflict-free transpose"); clMemSet.execute(d_tdataCL,0.f,num_elements); { b3LauncherCL launcher( queue, transposeNoBankConflictsKernel); launcher.setBuffer( d_tdataCL.getBufferCL()); launcher.setBuffer( d_idataCL.getBufferCL()); launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); startEvent = clock.getTimeMicroseconds()/1e3; for (int i = 0; i < NUM_REPS; i++) launcher.launch2D(numThreadsX,numThreadsY,localSizeX,localSizeY ); oclCHECKERROR(ciErrNum, CL_SUCCESS); clFinish(queue); stopEvent = clock.getTimeMicroseconds()/1e3; } ms = float(stopEvent-startEvent); d_tdataCL.copyToHostPointer(h_tdata,num_elements,0); postprocess(gold, h_tdata, nx * ny, ms); error_exit: // cleanup clReleaseKernel(copyKernel); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(h_idata); free(h_tdata); free(h_cdata); free(gold); printf("Press \n"); getchar(); }