// Implementation of btGpuSapBroadphase: construction/destruction of its OpenCL kernels, the
// overlapping-pair computation, and AABB proxy creation.
//
// NOTE(review): the line breaks of this file appear to have been collapsed, and several spans
// of text are missing (e.g. "btAlignedObjectArray allHostAabbs" has no template argument and
// loops such as "for (int j=0;j hostPairs;" have lost their condition and body). The original
// '//' comments and preprocessor directives now sit in the middle of long lines. Restore the
// file from version control before relying on it; the notes below describe only what is still
// readable.
//
// Constructor btGpuSapBroadphase(ctx, device, q):
//   - stores ctx/device/q in m_context/m_device/m_queue and constructs the six GPU arrays
//     (m_allAabbsGPU, m_smallAabbsGPU, m_largeAabbsGPU, m_overlappingPairs, m_gpuSmallSortData,
//     m_gpuSmallSortedAabbs) with (ctx,q);
//   - compiles two programs from the embedded source strings sapCL and sapFastCL and asserts
//     errNum==CL_SUCCESS after each;
//   - creates m_sap2Kernel from "computePairsKernelTwoArrays" (asserted);
//   - creates m_sapKernel: the "#if 0" variant (computePairsKernelOriginal) is disabled; when
//     __APPLE__ is not defined "computePairsKernel" from the sapFast program is used, otherwise
//     "computePairsKernelLocalSharedMemory" from the sap program (continues on the next line).
// NOTE(review): sapProg and sapFastProg are never passed to clReleaseProgram in the visible
// code -- confirm whether compileCLKernelFromString takes ownership or whether they leak.
#include "btGpuSapBroadphase.h" #include "BulletCommon/btVector3.h" #include "parallel_primitives/host/btLauncherCL.h" #include "BulletCommon/btQuickprof.h" #include "basic_initialize/btOpenCLUtils.h" #include "../kernels/sapKernels.h" #include "../kernels/sapFastKernels.h" #include "BulletCommon/btMinMax.h" btGpuSapBroadphase::btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) :m_context(ctx), m_device(device), m_queue(q), m_allAabbsGPU(ctx,q), m_smallAabbsGPU(ctx,q), m_largeAabbsGPU(ctx,q), m_overlappingPairs(ctx,q), m_gpuSmallSortData(ctx,q), m_gpuSmallSortedAabbs(ctx,q) { const char* sapSrc = sapCL; const char* sapFastSrc = sapFastCL; cl_int errNum=0; cl_program sapProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl"); btAssert(errNum==CL_SUCCESS); cl_program sapFastProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl"); btAssert(errNum==CL_SUCCESS); //m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); //m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg ); //m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); m_sap2Kernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); btAssert(errNum==CL_SUCCESS); #if 0 m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg ); btAssert(errNum==CL_SUCCESS); #else #ifndef __APPLE__ m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg ); btAssert(errNum==CL_SUCCESS); #else m_sapKernel = 
// Rest of the constructor: creates m_flipFloatKernel, m_copyAabbsKernel and m_scatterKernel
// from the sap program and allocates m_sorter with new btRadixSort32CL.
// NOTE(review): errNum is not asserted after these last three kernel creations, unlike the
// earlier ones.
//
// Destructor: deletes m_sorter and calls clReleaseKernel on the scatter, flipFloat, copyAabbs,
// sap and sap2 kernels.
//
// TestAabbAgainstAabb2(aabbMin1, aabbMax1, aabbMin2, aabbMax2): file-local helper that returns
// false as soon as the two boxes are separated on the X, Z or Y axis and true otherwise. The
// comparisons are strict, so boxes that merely touch are reported as overlapping.
btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg ); btAssert(errNum==CL_SUCCESS); #endif #endif m_flipFloatKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg ); m_copyAabbsKernel= btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); m_scatterKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg ); m_sorter = new btRadixSort32CL(m_context,m_device,m_queue); } btGpuSapBroadphase::~btGpuSapBroadphase() { delete m_sorter; clReleaseKernel(m_scatterKernel); clReleaseKernel(m_flipFloatKernel); clReleaseKernel(m_copyAabbsKernel); clReleaseKernel(m_sapKernel); clReleaseKernel(m_sap2Kernel); } /// conservative test for overlap between two aabbs static bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1, const btVector3 &aabbMin2, const btVector3 &aabbMax2) { bool overlap = true; overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap; overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap; overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? 
// calculateOverlappingPairs(forceHost): computes the overlapping pairs into m_overlappingPairs.
//   - the sweep axis is hardcoded to 0 and the CPU/GPU "all AABBs" arrays are asserted to have
//     the same size;
//   - the forceHost branch copies AABB arrays back to the host.
//     NOTE(review): most of this branch (loop conditions, loop bodies, template arguments) is
//     missing from the text below, so its exact behaviour cannot be read from this file;
//   - readable device steps: a sort is executed on m_gpuSmallSortData (the call itself is partly
//     lost), m_gpuSmallSortedAabbs is resized to numSmallAabbs and filled by m_scatterKernel
//     with a 1D launch of numSmallAabbs work items, each step followed by clFinish;
//   - pair capacity is maxPairsPerBody (64) * numSmallAabbs; pairCount is a one-element device
//     array initialised to 0;
//   - if both large and small AABBs exist, m_sap2Kernel is launched in 2D over
//     (numLargeAabbs, numSmallAabbs) with hardcoded sizes 4 and 64, then pairCount is read back
//     and clamped to maxPairs.
false : overlap; return overlap; } void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost) { int axis = 0;//todo on GPU for now hardcode btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size()); if (forceHost) { btAlignedObjectArray allHostAabbs; m_allAabbsGPU.copyToHost(allHostAabbs); { int numSmallAabbs = m_smallAabbsCPU.size(); for (int j=0;j hostPairs; { int numSmallAabbs = m_smallAabbsCPU.size(); for (int i=0;i allHostAabbs; m_allAabbsGPU.copyToHost(allHostAabbs); m_smallAabbsGPU.copyToHost(m_smallAabbsCPU); { int numSmallAabbs = m_smallAabbsCPU.size(); for (int j=0;j allHostAabbs; m_allAabbsGPU.copyToHost(allHostAabbs); m_largeAabbsGPU.copyToHost(m_largeAabbsCPU); { int numLargeAabbs = m_largeAabbsCPU.size(); for (int j=0;jexecute(m_gpuSmallSortData); clFinish(m_queue); } m_gpuSmallSortedAabbs.resize(numSmallAabbs); if (numSmallAabbs) { BT_PROFILE("scatterKernel"); btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())}; btLauncherCL launcher(m_queue, m_scatterKernel ); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( numSmallAabbs); int num = numSmallAabbs; launcher.launch1D( num); clFinish(m_queue); } int maxPairsPerBody = 64; int maxPairs = maxPairsPerBody * numSmallAabbs;//todo m_overlappingPairs.resize(maxPairs); btOpenCLArray pairCount(m_context, m_queue); pairCount.push_back(0); int numPairs=0; { int numLargeAabbs = m_largeAabbsGPU.size(); if (numLargeAabbs && numSmallAabbs) { BT_PROFILE("sap2Kernel"); btBufferInfoCL bInfo[] = { btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())}; btLauncherCL launcher(m_queue, m_sap2Kernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( numLargeAabbs ); 
// calculateOverlappingPairs, continued:
//   - when m_gpuSmallSortedAabbs is non-empty, m_sapKernel is set up with the sorted small
//     AABBs, the pair buffer and pairCount plus the constants numSmallAabbs, axis and maxPairs.
//     The code that serialises the kernel arguments sits under "#if 0" and is partly missing;
//   - the "#else" path is a debugging replay: it reads serialised kernel arguments from
//     "m_sapKernelArgs.bin", deserialises them into the launcher, launches m_sapKernel in 1D
//     with the work-item count stored after the arguments, reads the pair count from
//     launcher.m_arrays[2] and copies numPairs pairs from launcher.m_arrays[1] through a host
//     array into m_overlappingPairs.
// NOTE(review): on a file-size error this path prints a message and calls exit(0), i.e. it
// terminates the process with a success status. The malloc result and the fread return value
// are not checked, and the ftell result is compared against EOF rather than -1L.
launcher.setConst( numSmallAabbs); launcher.setConst( axis ); launcher.setConst( maxPairs ); //@todo: use actual maximum work item sizes of the device instead of hardcoded values launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); numPairs = pairCount.at(0); if (numPairs >maxPairs) numPairs =maxPairs; } } if (m_gpuSmallSortedAabbs.size()) { BT_PROFILE("sapKernel"); btBufferInfoCL bInfo[] = { btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())}; btLauncherCL launcher(m_queue, m_sapKernel); launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) ); launcher.setConst( numSmallAabbs ); launcher.setConst( axis ); launcher.setConst( maxPairs ); int num = numSmallAabbs; #if 0 int buffSize = launcher.getSerializationBufferSize(); unsigned char* buf = new unsigned char[buffSize+sizeof(int)]; for (int i=0;imaxPairs) numPairs = maxPairs; } #else int numPairs = 0; btLauncherCL launcher(m_queue, m_sapKernel); const char* fileName = "m_sapKernelArgs.bin"; FILE* f = fopen(fileName,"rb"); if (f) { int sizeInBytes=0; if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) { printf("error, cannot get file size\n"); exit(0); } unsigned char* buf = (unsigned char*) malloc(sizeInBytes); fread(buf,sizeInBytes,1,f); int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context); int num = *(int*)&buf[serializedBytes]; launcher.launch1D( num); btOpenCLArray pairCount(m_context, m_queue); int numElements = launcher.m_arrays[2]->size()/sizeof(int); pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements); numPairs = pairCount.at(0); //printf("overlapping pairs = %d\n",numPairs); btAlignedObjectArray hostOoverlappingPairs; btOpenCLArray tmpGpuPairs(m_context,m_queue); tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs ); tmpGpuPairs.copyToHost(hostOoverlappingPairs); 
// End of calculateOverlappingPairs: the replay path frees its buffer and closes the file (or
// prints an error if the file could not be opened), waits with clFinish, and finally
// m_overlappingPairs is resized to numPairs so that getNumOverlap() reports the pair count.
//
// writeAabbsToGpu(): copies m_allAabbsCPU, m_smallAabbsCPU and m_largeAabbsCPU to their GPU
// counterparts.
//
// createLargeProxy / createProxy(aabbMin, aabbMax, userPtr, collisionFilterGroup,
// collisionFilterMask): build a btSapAabb from the first four components of aabbMin/aabbMax,
// store userPtr in m_minIndices[3] and the current size of m_allAabbsCPU in
// m_signedMaxIndices[3], then append the AABB to m_allAabbsCPU and to m_largeAabbsCPU or
// m_smallAabbsCPU respectively. The two collision filter parameters are not used in the body.
//
// getAabbBuffer(), getNumOverlap(), getOverlappingPairBuffer(): return the cl_mem of
// m_allAabbsGPU, the size of m_overlappingPairs, and the cl_mem of m_overlappingPairs.
m_overlappingPairs.copyFromHost(hostOoverlappingPairs); //printf("hello %d\n", m_overlappingPairs.size()); free(buf); fclose(f); } else { printf("error: cannot find file %s\n",fileName); } clFinish(m_queue); #endif m_overlappingPairs.resize(numPairs); }//BT_PROFILE("GPU_RADIX SORT"); } void btGpuSapBroadphase::writeAabbsToGpu() { m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU); m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU); } void btGpuSapBroadphase::createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) { int index = userPtr; btSapAabb aabb; for (int i=0;i<4;i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; } aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_largeAabbsCPU.push_back(aabb); m_allAabbsCPU.push_back(aabb); } void btGpuSapBroadphase::createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) { int index = userPtr; btSapAabb aabb; for (int i=0;i<4;i++) { aabb.m_min[i] = aabbMin[i]; aabb.m_max[i] = aabbMax[i]; } aabb.m_minIndices[3] = index; aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size(); m_smallAabbsCPU.push_back(aabb); m_allAabbsCPU.push_back(aabb); } cl_mem btGpuSapBroadphase::getAabbBuffer() { return m_allAabbsGPU.getBufferCL(); } int btGpuSapBroadphase::getNumOverlap() { return m_overlappingPairs.size(); } cl_mem btGpuSapBroadphase::getOverlappingPairBuffer() { return m_overlappingPairs.getBufferCL(); }