#include "btPrefixScanCL.h"
#include "btFillCL.h"
#define BT_PREFIXSCAN_PROG_PATH "opencl/parallel_primitives/kernels/PrefixScanKernels.cl"
#include "btLauncherCL.h"
#include "../../basic_initialize/b3OpenCLUtils.h"
#include "../kernels/PrefixScanKernelsCL.h"

// NOTE(review): this file had lost every "<...>" span (template argument lists
// and part of the btNextPowerOf2 loop). The element type <unsigned int> used
// below is inferred from the `unsigned int* sum` that execute() passes to
// copyToHostPointer() -- confirm against the declarations in btPrefixScanCL.h.

/// Compiles the prefix-scan program from the embedded source string and creates
/// the three kernels used by execute().
///
/// @param ctx    OpenCL context used for the work buffer and program.
/// @param device Device the program is compiled for.
/// @param queue  Command queue stored in m_commandQueue and used for all launches.
/// @param size   Initial element count of the internal work buffer.
btPrefixScanCL::btPrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
	: m_commandQueue(queue)
{
	const char* scanKernelSource = prefixScanKernelsCL;
	cl_int pErrNum;
	char* additionalMacros = 0;

	m_workBuffer = new btOpenCLArray<unsigned int>(ctx, queue, size);

	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(
		ctx, device, scanKernelSource, &pErrNum, additionalMacros, BT_PREFIXSCAN_PROG_PATH);
	btAssert(scanProg);

	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(
		ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
	btAssert(m_localScanKernel);

	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(
		ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
	btAssert(m_blockSumKernel);

	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(
		ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
	btAssert(m_propagationKernel);

	// The program handle is only a local here; without this release its
	// creation reference is leaked. Kernel objects keep their program alive
	// until they are released themselves, so dropping our reference is safe.
	// NOTE(review): assumes compileCLKernelFromString does not take ownership
	// of the program passed to it -- confirm in b3OpenCLUtils.
	if (scanProg)
	{
		clReleaseProgram(scanProg);
	}
}

/// Frees the work buffer and releases the three kernels created by the constructor.
btPrefixScanCL::~btPrefixScanCL()
{
	delete m_workBuffer;
	clReleaseKernel(m_localScanKernel);
	clReleaseKernel(m_blockSumKernel);
	clReleaseKernel(m_propagationKernel);
}

/// Rounds n up to the next power of two (values that already are a power of
/// two are returned unchanged).
///
/// Intended for unsigned integer types. For n == 0 the initial decrement wraps
/// and the result is 0.
template <class T>
T btNextPowerOf2(T n)
{
	n -= 1;
	// Smear the highest set bit into every lower position by doubling the
	// shift distance each step; afterwards n has the form 0b0..01..1.
	for (unsigned int shift = 1; shift < sizeof(T) * 8; shift <<= 1)
	{
		n = n | (n >> shift);
	}
	return n + 1;
}

/// Runs the GPU prefix scan of the first n elements of src into dst.
///
/// dst and the internal work buffer are resized to src.size(). Three kernels
/// are enqueued on m_commandQueue: the local scan, the top-level scan over the
/// work buffer, and (only when more than one block exists) the offset
/// propagation.
///
/// @param src Input values.
/// @param dst Receives the scan result; resized to src.size().
/// @param n   Number of elements to scan. When n <= 0 nothing is launched.
/// @param sum Optional. When non-null, receives dst[n-1] read back from the
///            device (0 when n <= 0). The read-back blocks until the queue
///            has finished.
void btPrefixScanCL::execute(btOpenCLArray<unsigned int>& src, btOpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
	//	btAssert( data->m_option == EXCLUSIVE );

	dst.resize(src.size());
	m_workBuffer->resize(src.size());

	// Nothing to scan: avoid enqueuing a zero-sized ND-range and reading
	// dst[n-1] at a negative index.
	if (n <= 0)
	{
		if (sum)
		{
			*sum = 0;
		}
		return;
	}

	// Each block of BLOCK_SIZE work-items covers BLOCK_SIZE*2 input elements.
	const unsigned int numBlocks = static_cast<unsigned int>((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));

	btInt4 constBuffer;
	constBuffer.x = n;
	constBuffer.y = numBlocks;
	// Power-of-two padded block count; presumably consumed by the top-level
	// scan kernel -- see PrefixScanKernels.cl.
	constBuffer.z = (int)btNextPowerOf2(numBlocks);

	// Pass 1: "LocalScanKernel" over dst, src and the work buffer.
	{
		btBufferInfoCL bInfo[] = {
			btBufferInfoCL(dst.getBufferCL()),
			btBufferInfoCL(src.getBufferCL()),
			btBufferInfoCL(m_workBuffer->getBufferCL())};

		btLauncherCL launcher(m_commandQueue, m_localScanKernel);
		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(btBufferInfoCL));
		launcher.setConst(constBuffer);
		launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
	}

	// Pass 2: "TopLevelScanKernel" over the work buffer, one work-group only.
	{
		btBufferInfoCL bInfo[] = {
			btBufferInfoCL(m_workBuffer->getBufferCL())};

		btLauncherCL launcher(m_commandQueue, m_blockSumKernel);
		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(btBufferInfoCL));
		launcher.setConst(constBuffer);
		launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
	}

	// Pass 3: "AddOffsetKernel" over dst and the work buffer. Launched for
	// numBlocks-1 groups, so it is skipped entirely for a single block.
	if (numBlocks > 1)
	{
		btBufferInfoCL bInfo[] = {
			btBufferInfoCL(dst.getBufferCL()),
			btBufferInfoCL(m_workBuffer->getBufferCL())};

		btLauncherCL launcher(m_commandQueue, m_propagationKernel);
		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(btBufferInfoCL));
		launcher.setConst(constBuffer);
		launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
	}

	if (sum)
	{
		// Wait for the kernels, then read back the single element dst[n-1].
		clFinish(m_commandQueue);
		dst.copyToHostPointer(sum, 1, n - 1, true);
	}
}

// NOTE(review): the definition below is cut off in the available source (it
// suffers from the same "<...>" stripping as the rest of the file) and is
// therefore carried through unchanged; restore it from version control.
void btPrefixScanCL::executeHost(btAlignedObjectArray& src, btAlignedObjectArray& dst, int n, unsigned int* sum) { unsigned int s = 0; //if( data->m_option == EXCLUSIVE ) { for(int i=0; i