// Implementation of btRadixSort32CL: host-side driver for the OpenCL kernels
// compiled from the radixSort32KernelsCL source string.
//
// NOTE(review): this text appears to have lost its line breaks and everything
// that stood between a '<' and a following '>' (template arguments, several
// loop headers, the fast-scan selection in the key/value execute(), the head
// of the keys-only execute() signature). Restore those from the original file
// before building. The notes below describe only what is still visible here.
//
// btRadixSort32CL(ctx, device, queue, initialCapacity)
//   - Stores queue in m_commandQueue and sets m_deviceCPU from the
//     CL_DEVICE_TYPE_CPU bit of the device type reported by
//     b3OpenCLUtils::getDeviceInfo.
//   - Allocates m_workBuffer1, 2, 3, 3a, 4 and 4a. When initialCapacity > 0
//     every one of them except m_workBuffer2 is resized to initialCapacity.
//   - Creates the btPrefixScanCL (m_scan) and btFillCL (m_fill) helpers.
//   - Compiles one program from radixSort32KernelsCL (RADIXSORT32_PATH is
//     passed along to compileCLProgramFromString) and creates the kernels
//     "StreamCountSortDataKernel", "StreamCountKernel" and "PrefixScanKernel".
//     The two scatter kernels are created from "SortAndScatterSortDataKernelSerial"
//     and "SortAndScatterKernelSerial" when m_deviceCPU is set, otherwise from
//     "SortAndScatterSortDataKernel" and "SortAndScatterKernel".
//     Each result is only checked with btAssert.
//   - NOTE(review): the local sortProg is not released in the constructor or
//     the destructor shown here -- confirm whether b3OpenCLUtils owns it.
//   - NOTE(review): srcFileNameForCaching is declared but not used.
//
// ~btRadixSort32CL()
//   - Deletes m_scan, m_fill and the six work buffers, then calls
//     clReleaseKernel on the five kernels created by the constructor.
//
// executeHost(host array, sortBits)
//   - Host-side path using BITS_PER_PASS = 8. src and dst are exchanged with
//     btSwap after each pass and count is incremented; if count ends up odd
//     btAssert(0) fires (see the "need to copy" comment), so no copy back
//     into inout is performed in that case.
//
// executeHost(OpenCL array, sortBits)
//   - Copies keyValuesInOut to a host array, runs the host overload on it and
//     copies the result back with copyFromHost.
//
// execute(keysIn, keysOut, valuesIn, valuesOut, n, sortBits)
//   - Empty body; does nothing.
//
// execute(keyValuesInOut, sortBits)
//   - If the element count is not a multiple of DATA_ALIGNMENT, the input is
//     copied into m_workBuffer4, which is resized to the next multiple, and
//     the added tail is filled through m_fill with key = value = 0xffffffff.
//     Otherwise the caller's array is used directly and m_workBuffer4 is
//     resized to 0.
//   - m_workBuffer1 is resized to NUM_BUCKET*NUM_WGS and m_workBuffer3 to the
//     working size. btAssert checks BITS_PER_PASS == 4, WG_SIZE == 64 and
//     (sortBits & 0x3) == 0.
//   - Visible per-pass steps: m_streamCountSortDataKernel is launched over
//     NUM_WGS*WG_SIZE work items; the histogram is scanned either with
//     m_prefixScanKernel (launch1D(128, 128), destHisto aliased to srcHisto)
//     or with m_scan->execute(*srcHisto, *destHisto, 1920, 0);
//     m_sortAndScatterSortDataKernel is launched over nWGs*WG_SIZE work items.
//     The 1920 is presumably NUM_BUCKET*NUM_WGS -- TODO confirm.
//   - The #else branch of USE_GPU is a host-side debug path that recomputes
//     the scatter on the CPU, printf()s key/value mismatches and uploads the
//     result with dst->copyFromHost(dstHost).
//   - When padding was used, m_workBuffer4 is resized back to originalSize and
//     copied into keyValuesInOut.
//
// execute(keysInOut, sortBits)
//   - Same structure for bare unsigned int keys: padding goes through
//     m_workBuffer4a with fill value 0xffffffff, the destination buffer is
//     m_workBuffer3a, and the kernels are m_streamCountKernel and
//     m_sortAndScatterKernel.
//   - fastScan is !m_deviceCPU when _WIN32 is defined and false otherwise (the
//     in-code comment says the fast prefix scan does not work properly on
//     Mac OSX yet).
//   - After each pass src/dst and srcHisto/destHisto are swapped; an odd pass
//     count trips btAssert(0). When padding was used, m_workBuffer4a is
//     resized back to originalSize and copied into keysInOut.
#include "btRadixSort32CL.h" #include "btLauncherCL.h" #include "../../basic_initialize/b3OpenCLUtils.h" #include "btPrefixScanCL.h" #include "btFillCL.h" #define RADIXSORT32_PATH "opencl/parallel_primitives/kernels/RadixSort32Kernels.cl" #include "../kernels/RadixSort32KernelsCL.h" btRadixSort32CL::btRadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity) :m_commandQueue(queue) { btOpenCLDeviceInfo info; b3OpenCLUtils::getDeviceInfo(device,&info); m_deviceCPU = (info.m_deviceType & CL_DEVICE_TYPE_CPU)!=0; m_workBuffer1 = new btOpenCLArray(ctx,queue); m_workBuffer2 = new btOpenCLArray(ctx,queue); m_workBuffer3 = new btOpenCLArray(ctx,queue); m_workBuffer3a = new btOpenCLArray(ctx,queue); m_workBuffer4 = new btOpenCLArray(ctx,queue); m_workBuffer4a = new btOpenCLArray(ctx,queue); if (initialCapacity>0) { m_workBuffer1->resize(initialCapacity); m_workBuffer3->resize(initialCapacity); m_workBuffer3a->resize(initialCapacity); m_workBuffer4->resize(initialCapacity); m_workBuffer4a->resize(initialCapacity); } m_scan = new btPrefixScanCL(ctx,device,queue); m_fill = new btFillCL(ctx,device,queue); const char* additionalMacros = ""; const char* srcFileNameForCaching=""; cl_int pErrNum; const char* kernelSource = radixSort32KernelsCL; cl_program sortProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, RADIXSORT32_PATH); btAssert(sortProg); m_streamCountSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountSortDataKernel", &pErrNum, sortProg,additionalMacros ); btAssert(m_streamCountSortDataKernel ); m_streamCountKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "StreamCountKernel", &pErrNum, sortProg,additionalMacros ); btAssert(m_streamCountKernel); if (m_deviceCPU) { m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernelSerial", 
&pErrNum, sortProg,additionalMacros ); btAssert(m_sortAndScatterSortDataKernel); m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernelSerial", &pErrNum, sortProg,additionalMacros ); btAssert(m_sortAndScatterKernel); } else { m_sortAndScatterSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterSortDataKernel", &pErrNum, sortProg,additionalMacros ); btAssert(m_sortAndScatterSortDataKernel); m_sortAndScatterKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SortAndScatterKernel", &pErrNum, sortProg,additionalMacros ); btAssert(m_sortAndScatterKernel); } m_prefixScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "PrefixScanKernel", &pErrNum, sortProg,additionalMacros ); btAssert(m_prefixScanKernel); } btRadixSort32CL::~btRadixSort32CL() { delete m_scan; delete m_fill; delete m_workBuffer1; delete m_workBuffer2; delete m_workBuffer3; delete m_workBuffer3a; delete m_workBuffer4; delete m_workBuffer4a; clReleaseKernel(m_streamCountSortDataKernel); clReleaseKernel(m_streamCountKernel); clReleaseKernel(m_sortAndScatterSortDataKernel); clReleaseKernel(m_sortAndScatterKernel); clReleaseKernel(m_prefixScanKernel); } void btRadixSort32CL::executeHost(btAlignedObjectArray& inout, int sortBits /* = 32 */) { int n = inout.size(); const int BITS_PER_PASS = 8; const int NUM_TABLES = (1< workbuffer; workbuffer.resize(inout.size()); btSortData* dst = &workbuffer[0]; int count=0; for(int startBit=0; startBit> startBit) & (NUM_TABLES-1); tables[tableIdx]++; } //#define TEST #ifdef TEST printf("histogram size=%d\n",NUM_TABLES); for (int i=0;i> startBit) & (NUM_TABLES-1); dst[tables[tableIdx] + counter[tableIdx]] = src[i]; counter[tableIdx] ++; } btSwap( src, dst ); count++; } if (count&1) { btAssert(0);//need to copy } } void btRadixSort32CL::executeHost(btOpenCLArray& keyValuesInOut, int sortBits /* = 32 */) { 
btAlignedObjectArray inout; keyValuesInOut.copyToHost(inout); executeHost(inout,sortBits); keyValuesInOut.copyFromHost(inout); } void btRadixSort32CL::execute(btOpenCLArray& keysIn, btOpenCLArray& keysOut, btOpenCLArray& valuesIn, btOpenCLArray& valuesOut, int n, int sortBits) { } //#define DEBUG_RADIXSORT //#define DEBUG_RADIXSORT2 void btRadixSort32CL::execute(btOpenCLArray& keyValuesInOut, int sortBits /* = 32 */) { int originalSize = keyValuesInOut.size(); int workingSize = originalSize; int dataAlignment = DATA_ALIGNMENT; #ifdef DEBUG_RADIXSORT2 btAlignedObjectArray test2; keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i* src = 0; if (workingSize%dataAlignment) { workingSize += dataAlignment-(workingSize%dataAlignment); m_workBuffer4->copyFromOpenCLArray(keyValuesInOut); m_workBuffer4->resize(workingSize); btSortData fillValue; fillValue.m_key = 0xffffffff; fillValue.m_value = 0xffffffff; #define USE_BTFILL #ifdef USE_BTFILL m_fill->execute((btOpenCLArray&)*m_workBuffer4,(btInt2&)fillValue,workingSize-originalSize,originalSize); #else //fill the remaining bits (very slow way, todo: fill on GPU/OpenCL side) for (int i=originalSize; icopyFromHostPointer(&fillValue,1,i); } #endif//USE_BTFILL src = m_workBuffer4; } else { src = &keyValuesInOut; m_workBuffer4->resize(0); } btAssert( workingSize%DATA_ALIGNMENT == 0 ); int minCap = NUM_BUCKET*NUM_WGS; int n = workingSize; m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); btAssert( BITS_PER_PASS == 4 ); btAssert( WG_SIZE == 64 ); btAssert( (sortBits&0x3) == 0 ); btOpenCLArray* dst = m_workBuffer3; btOpenCLArray* srcHisto = m_workBuffer1; btOpenCLArray* destHisto = m_workBuffer2; int nWGs = NUM_WGS; btConstData cdata; { int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 int nBlocks = (n+blockSize-1)/(blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; cdata.m_nBlocksPerWG = (nBlocks + 
cdata.m_nWGs - 1)/cdata.m_nWGs; if( nBlocks < NUM_WGS ) { cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } int count=0; for(int ib=0; ibsize()) { btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) }; btLauncherCL launcher(m_commandQueue, m_streamCountSortDataKernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); int num = NUM_WGS*WG_SIZE; launcher.launch1D( num, WG_SIZE ); } #ifdef DEBUG_RADIXSORT btAlignedObjectArray testHist; srcHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;igetBufferCL() ) }; btLauncherCL launcher( m_commandQueue, m_prefixScanKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( 128, 128 ); destHisto = srcHisto; }else { //unsigned int sum; //for debugging m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); } #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;isize()) {// local sort and distribute btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )}; btLauncherCL launcher( m_commandQueue, m_sortAndScatterSortDataKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); } #else { #define NUM_TABLES 16 //#define SEQUENTIAL #ifdef SEQUENTIAL int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int tables[NUM_TABLES]; int startBit = ib; destHisto->copyToHost(testHist); btAlignedObjectArray srcHost; btAlignedObjectArray dstHost; dstHost.resize(src->size()); src->copyToHost(srcHost); for (int i=0;i> startBit) & (NUM_TABLES-1); dstHost[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; 
counter2[tableIdx] ++; } #else int counter2[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; int tables[NUM_TABLES]; btAlignedObjectArray dstHostOK; dstHostOK.resize(src->size()); destHisto->copyToHost(testHist); btAlignedObjectArray srcHost; src->copyToHost(srcHost); int blockSize = 256; int nBlocksPerWG = cdata.m_nBlocksPerWG; int startBit = ib; { for (int i=0;i> startBit) & (NUM_TABLES-1); dstHostOK[tables[tableIdx] + counter2[tableIdx]] = srcHost[i]; counter2[tableIdx] ++; } } btAlignedObjectArray dstHost; dstHost.resize(src->size()); int counter[NUM_TABLES]={0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; for (int wgIdx=0;wgIdx> startBit) & (NUM_TABLES-1); int destIndex = testHist[tableIdx*NUM_WGS+wgIdx] + counter[tableIdx]; btSortData ok = dstHostOK[destIndex]; if (ok.m_key != srcHost[i].m_key) { printf("ok.m_key = %d, srcHost[i].m_key = %d\n", ok.m_key,srcHost[i].m_key ); printf("(ok.m_value = %d, srcHost[i].m_value = %d)\n", ok.m_value,srcHost[i].m_value ); } if (ok.m_value != srcHost[i].m_value) { printf("ok.m_value = %d, srcHost[i].m_value = %d\n", ok.m_value,srcHost[i].m_value ); printf("(ok.m_key = %d, srcHost[i].m_key = %d)\n", ok.m_key,srcHost[i].m_key ); } dstHost[destIndex] = srcHost[i]; counter[tableIdx] ++; } } } } } #endif //SEQUENTIAL dst->copyFromHost(dstHost); } #endif//USE_GPU #ifdef DEBUG_RADIXSORT destHisto->copyToHost(testHist); printf("ib = %d, testHist size = %d, non zero elements:\n",ib, testHist.size()); for (int i=0;isize()) { m_workBuffer4->resize(originalSize); keyValuesInOut.copyFromOpenCLArray(*m_workBuffer4); } #ifdef DEBUG_RADIXSORT keyValuesInOut.copyToHost(test2); printf("numElem = %d\n",test2.size()); for (int i=0;i& keysInOut, int sortBits /* = 32 */) { int originalSize = keysInOut.size(); int workingSize = originalSize; int dataAlignment = DATA_ALIGNMENT; btOpenCLArray* src = 0; if (workingSize%dataAlignment) { workingSize += dataAlignment-(workingSize%dataAlignment); m_workBuffer4a->copyFromOpenCLArray(keysInOut); 
m_workBuffer4a->resize(workingSize); unsigned int fillValue = 0xffffffff; m_fill->execute(*m_workBuffer4a,fillValue,workingSize-originalSize,originalSize); src = m_workBuffer4a; } else { src = &keysInOut; m_workBuffer4a->resize(0); } btAssert( workingSize%DATA_ALIGNMENT == 0 ); int minCap = NUM_BUCKET*NUM_WGS; int n = workingSize; m_workBuffer1->resize(minCap); m_workBuffer3->resize(workingSize); m_workBuffer3a->resize(workingSize); // ADLASSERT( ELEMENTS_PER_WORK_ITEM == 4 ); btAssert( BITS_PER_PASS == 4 ); btAssert( WG_SIZE == 64 ); btAssert( (sortBits&0x3) == 0 ); btOpenCLArray* dst = m_workBuffer3a; btOpenCLArray* srcHisto = m_workBuffer1; btOpenCLArray* destHisto = m_workBuffer2; int nWGs = NUM_WGS; btConstData cdata; { int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;//set at 256 int nBlocks = (n+blockSize-1)/(blockSize); cdata.m_n = n; cdata.m_nWGs = NUM_WGS; cdata.m_startBit = 0; cdata.m_nBlocksPerWG = (nBlocks + cdata.m_nWGs - 1)/cdata.m_nWGs; if( nBlocks < NUM_WGS ) { cdata.m_nBlocksPerWG = 1; nWGs = nBlocks; } } int count=0; for(int ib=0; ibsize()) { btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( srcHisto->getBufferCL() ) }; btLauncherCL launcher(m_commandQueue, m_streamCountKernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); int num = NUM_WGS*WG_SIZE; launcher.launch1D( num, WG_SIZE ); } //fast prefix scan is not working properly on Mac OSX yet #ifdef _WIN32 bool fastScan=!m_deviceCPU; #else bool fastScan=false; #endif if (fastScan) {// prefix scan group histogram btBufferInfoCL bInfo[] = { btBufferInfoCL( srcHisto->getBufferCL() ) }; btLauncherCL launcher( m_commandQueue, m_prefixScanKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( 128, 128 ); destHisto = srcHisto; }else { //unsigned int sum; //for debugging m_scan->execute(*srcHisto,*destHisto,1920,0);//,&sum); } if (src->size()) {// 
local sort and distribute btBufferInfoCL bInfo[] = { btBufferInfoCL( src->getBufferCL(), true ), btBufferInfoCL( destHisto->getBufferCL(), true ), btBufferInfoCL( dst->getBufferCL() )}; btLauncherCL launcher( m_commandQueue, m_sortAndScatterKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( cdata ); launcher.launch1D( nWGs*WG_SIZE, WG_SIZE ); } btSwap(src, dst ); btSwap(srcHisto,destHisto); count++; } if (count&1) { btAssert(0);//need to copy from workbuffer to keyValuesInOut } if (m_workBuffer4a->size()) { m_workBuffer4a->resize(originalSize); keysInOut.copyFromOpenCLArray(*m_workBuffer4a); } }