From ef6be5370df84d0a992dfd64d2a06593a51be376 Mon Sep 17 00:00:00 2001 From: erwincoumans Date: Mon, 11 Nov 2013 02:19:04 -0800 Subject: [PATCH] more work towards hybrid of GPU grid and sap broadphase, separating small, large (moving) and static objects re-enable 'useNewBatchingKernel', it is slower but more robust --- Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp | 6 +- .../b3GpuGridBroadphase.cpp | 252 +++++++++++++----- .../BroadphaseCollision/b3GpuGridBroadphase.h | 9 +- .../kernels/gridBroadphaseKernels.h | 78 ++++-- src/Bullet3OpenCL/RigidBody/b3Solver.cpp | 2 +- 5 files changed, 244 insertions(+), 103 deletions(-) diff --git a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp index 284fd31d2..661c77da9 100644 --- a/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp +++ b/Demos3/GpuDemos/rigidbody/GpuConvexScene.cpp @@ -45,11 +45,11 @@ void GpuConvexScene::setupScene(const ConstructionInfo& ci) m_data->m_rigidBodyPipeline->writeAllInstancesToGpu(); - float camPos[4]={ci.arraySizeX,ci.arraySizeY/2,ci.arraySizeZ,0}; + float camPos[4]={0,0,0,0};//ci.arraySizeX,ci.arraySizeY/2,ci.arraySizeZ,0}; //float camPos[4]={1,12.5,1.5,0}; m_instancingRenderer->setCameraTargetPosition(camPos); - m_instancingRenderer->setCameraDistance(114); + m_instancingRenderer->setCameraDistance(150); //m_instancingRenderer->setCameraYaw(85); m_instancingRenderer->setCameraYaw(30); m_instancingRenderer->setCameraPitch(225); @@ -158,7 +158,7 @@ int GpuConvexScene::createDynamicsObjects2(const ConstructionInfo& ci, const flo //mass=0.f; } b3Vector3 position = b3MakeVector3(((j+1)&1)+i*2.2,1+j*2.,((j+1)&1)+k*2.2); - //b3Vector3 position(i*2.2,10+j*1.9,k*2.2); + //b3Vector3 position = b3MakeVector3(i*2,1+j*2,k*2); //b3Vector3 position=b3MakeVector3(1,0.9,1); b3Quaternion orn(0,0,0,1); diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp index 97c730b2d..294452514 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -2,7 +2,7 @@ #include "b3GpuGridBroadphase.h" #include "Bullet3Geometry/b3AabbUtil.h" #include "kernels/gridBroadphaseKernels.h" - +#include "kernels/sapKernels.h" //#include "kernels/gridBroadphase.cl" @@ -11,29 +11,32 @@ - +#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" #define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" cl_kernel kCalcHashAABB; cl_kernel kClearCellStart; cl_kernel kFindCellStart; cl_kernel kFindOverlappingPairs; - - +cl_kernel m_copyAabbsKernel; +cl_kernel m_sap2Kernel; cl_kernel kFindPairsLarge; cl_kernel kComputePairCacheChanges; cl_kernel kSqueezeOverlappingPairBuff; -int maxPairsPerBody = 32; -int maxBodiesPerCell = 1024;//?? +int maxPairsPerBody = 64; +int maxBodiesPerCell = 256;//?? b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) :m_context(ctx), m_device(device), m_queue(q), -m_allAabbsGPU(ctx,q), +m_allAabbsGPU1(ctx,q), +m_largeAabbsGPU(ctx,q), +m_smallAabbsGPU(ctx,q), m_gpuPairs(ctx,q), + m_hashGpu(ctx,q), m_paramsGPU(ctx,q), m_cellStartGpu(ctx,q) @@ -55,30 +58,43 @@ m_cellStartGpu(ctx,q) m_paramsGPU.push_back(m_paramsCPU); cl_int errNum=0; - cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,0,&errNum,"",B3_GRID_BROADPHASE_PATH,true); - b3Assert(errNum==CL_SUCCESS); - kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + { + const char* sapSrc = sapCL; + cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH); + b3Assert(errNum==CL_SUCCESS); + m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg ); + m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg ); + b3Assert(errNum==CL_SUCCESS); + } + + { + + cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,0,&errNum,"",B3_GRID_BROADPHASE_PATH,true); + b3Assert(errNum==CL_SUCCESS); + + kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); - kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); - kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); - kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); - kFindPairsLarge = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindPairsLarge",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + kFindPairsLarge = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindPairsLarge",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); - kComputePairCacheChanges = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kComputePairCacheChanges",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + kComputePairCacheChanges = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kComputePairCacheChanges",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); - kSqueezeOverlappingPairBuff = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kSqueezeOverlappingPairBuff",&errNum,gridProg); - b3Assert(errNum==CL_SUCCESS); + kSqueezeOverlappingPairBuff = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kSqueezeOverlappingPairBuff",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + } m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); @@ -89,7 +105,8 @@ b3GpuGridBroadphase::~b3GpuGridBroadphase() clReleaseKernel( kClearCellStart); clReleaseKernel( kFindCellStart); clReleaseKernel( kFindOverlappingPairs); - + clReleaseKernel( m_sap2Kernel); + clReleaseKernel( m_copyAabbsKernel); clReleaseKernel( kFindPairsLarge); clReleaseKernel( kComputePairCacheChanges); clReleaseKernel( kSqueezeOverlappingPairBuff); @@ -105,21 +122,29 @@ void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3 aabb.m_maxVec = aabbMax; aabb.m_minIndices[3] = userPtr; aabb.m_signedMaxIndices[3] = userPtr; - m_allAabbsCPU.push_back(aabb); + m_allAabbsCPU1.push_back(aabb); + m_smallAabbsCPU.push_back(aabb); } void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask) { - createProxy(aabbMin,aabbMax,userPtr,collisionFilterGroup,collisionFilterMask); - + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = userPtr; + m_allAabbsCPU1.push_back(aabb); + m_largeAabbsCPU.push_back(aabb); } void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) { B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs"); - /* - calculateOverlappingPairsHost(maxPairs); - { + + if (0) + { + calculateOverlappingPairsHost(maxPairs); + b3AlignedObjectArray cpuPairs; m_gpuPairs.copyToHost(cpuPairs); printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size()); @@ -128,19 +153,98 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y); } } - */ - //return; - int numAabbs = m_allAabbsGPU.size(); - if (numAabbs) + + //sync small AABBs { - m_hashGpu.resize(numAabbs); + int numSmallAabbs = m_smallAabbsGPU.size(); + if (numSmallAabbs) { + B3_PROFILE("copyAabbsKernelSmall"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU1.getBufferCL(), true ), + b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()), + }; + + b3LauncherCL launcher(m_queue, m_copyAabbsKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numSmallAabbs ); + int num = numSmallAabbs; + launcher.launch1D( num); + } + } + + //sync large AABBs + { + int numLargeAabbs = m_largeAabbsGPU.size(); + + if (numLargeAabbs) + { + B3_PROFILE("copyAabbsKernelLarge"); + b3BufferInfoCL bInfo[] = { + b3BufferInfoCL( m_allAabbsGPU1.getBufferCL(), true ), + b3BufferInfoCL( m_largeAabbsGPU.getBufferCL()), + }; + + b3LauncherCL launcher(m_queue, m_copyAabbsKernel ); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + int num = numLargeAabbs; + launcher.launch1D( num); + clFinish(m_queue); + } + } + + + + + int numSmallAabbs = m_smallAabbsGPU.size(); + + b3OpenCLArray pairCount(m_context,m_queue); + pairCount.push_back(0); + m_gpuPairs.resize(numSmallAabbs*maxPairsPerBody); + + { + int numLargeAabbs = m_largeAabbsGPU.size(); + if (numLargeAabbs && numSmallAabbs) + { + B3_PROFILE("sap2Kernel"); + b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_largeAabbsGPU.getBufferCL() ), + b3BufferInfoCL( m_smallAabbsGPU.getBufferCL() ), + b3BufferInfoCL( m_gpuPairs.getBufferCL() ), + b3BufferInfoCL(pairCount.getBufferCL())}; + b3LauncherCL launcher(m_queue, m_sap2Kernel); + launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst( numLargeAabbs ); + launcher.setConst( numSmallAabbs); + launcher.setConst( 0 );//axis is not used + launcher.setConst( maxPairs ); + //@todo: use actual maximum work item sizes of the device instead of hardcoded values + launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64); + + int numPairs = pairCount.at(0); + if (numPairs >maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs =maxPairs; + } + } + } + + + + + if (numSmallAabbs) + { + B3_PROFILE("gridKernel"); + m_hashGpu.resize(numSmallAabbs); + { + B3_PROFILE("kCalcHashAABB"); b3LauncherCL launch(m_queue,kCalcHashAABB); - launch.setConst(numAabbs); - launch.setBuffer(m_allAabbsGPU.getBufferCL()); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_smallAabbsGPU.getBufferCL()); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(this->m_paramsGPU.getBufferCL()); - launch.launch1D(numAabbs); + launch.launch1D(numSmallAabbs); } m_sorter->execute(m_hashGpu); @@ -151,6 +255,7 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) { + B3_PROFILE("kClearCellStart"); b3LauncherCL launch(m_queue,kClearCellStart); launch.setConst(numCells); launch.setBuffer(m_cellStartGpu.getBufferCL()); @@ -162,47 +267,45 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) { - + B3_PROFILE("kFindCellStart"); b3LauncherCL launch(m_queue,kFindCellStart); - launch.setConst(numAabbs); + launch.setConst(numSmallAabbs); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(m_cellStartGpu.getBufferCL()); - launch.launch1D(numAabbs); + launch.launch1D(numSmallAabbs); //m_cellStartGpu.copyToHost(cellStartCpu); //printf("??\n"); } { - - + B3_PROFILE("kFindOverlappingPairs"); + b3OpenCLArray pairsGpu2(m_context,m_queue); b3OpenCLArray pairsGpu(m_context,m_queue); b3OpenCLArray pairStartCurGpu(m_context,m_queue); b3AlignedObjectArray pairStartCpu; - m_gpuPairs.resize(numAabbs*maxPairsPerBody); - pairsGpu2.resize(numAabbs*maxPairsPerBody); - pairsGpu.resize(numAabbs*maxPairsPerBody); - pairStartCurGpu.resize(numAabbs*2+2); + + pairsGpu2.resize(numSmallAabbs*maxPairsPerBody); + pairsGpu.resize(numSmallAabbs*maxPairsPerBody); + pairStartCurGpu.resize(numSmallAabbs*2+2); - pairStartCpu.resize(numAabbs*2+2); + pairStartCpu.resize(numSmallAabbs*2+2); pairStartCpu[0] = 0; pairStartCpu[1] = 0; - for(int i = 1; i <= numAabbs; i++) + for(int i = 1; i <= numSmallAabbs; i++) { pairStartCpu[i * 2] = pairStartCpu[(i-1) * 2] + maxPairsPerBody; pairStartCpu[i * 2 + 1] = 0; } pairStartCurGpu.copyFromHost(pairStartCpu); - b3OpenCLArray pairCount(m_context,m_queue); - pairCount.push_back(0); - + b3LauncherCL launch(m_queue,kFindOverlappingPairs); - launch.setConst(numAabbs); - launch.setBuffer(m_allAabbsGPU.getBufferCL()); + launch.setConst(numSmallAabbs); + launch.setBuffer(m_smallAabbsGPU.getBufferCL()); launch.setBuffer(m_hashGpu.getBufferCL()); launch.setBuffer(m_cellStartGpu.getBufferCL()); launch.setBuffer(pairsGpu.getBufferCL()); @@ -212,24 +315,27 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) launch.setBuffer(pairCount.getBufferCL()); launch.setBuffer(m_gpuPairs.getBufferCL()); - launch.launch1D(numAabbs); + launch.launch1D(numSmallAabbs); int actualCount = pairCount.at(0); m_gpuPairs.resize(actualCount); - /* - b3AlignedObjectArray pairsCpu; - m_gpuPairs.copyToHost(pairsCpu); - - printf("m_gpuPairs.size()=%d\n",m_gpuPairs.size()); - for (int i=0;i pairsCpu; + m_gpuPairs.copyToHost(pairsCpu); + + printf("m_gpuPairs.size()=%d\n",m_gpuPairs.size()); + for (int i=0;im_allAabbsGPU.getBufferCL(); + return this->m_allAabbsGPU1.getBufferCL(); } int b3GpuGridBroadphase::getNumOverlap() { @@ -295,10 +407,10 @@ cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer() b3OpenCLArray& b3GpuGridBroadphase::getAllAabbsGPU() { - return m_allAabbsGPU; + return m_allAabbsGPU1; } b3AlignedObjectArray& b3GpuGridBroadphase::getAllAabbsCPU() { - return m_allAabbsCPU; + return m_allAabbsCPU1; } \ No newline at end of file diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h index 84b4721ab..db5af0948 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -29,8 +29,13 @@ protected: cl_device_id m_device; cl_command_queue m_queue; - b3OpenCLArray m_allAabbsGPU; - b3AlignedObjectArray m_allAabbsCPU; + b3OpenCLArray m_allAabbsGPU1; + b3AlignedObjectArray m_allAabbsCPU1; + + b3OpenCLArray m_smallAabbsGPU; + b3AlignedObjectArray m_smallAabbsCPU; + b3OpenCLArray m_largeAabbsGPU; + b3AlignedObjectArray m_largeAabbsCPU; b3AlignedObjectArray m_hostPairs; b3OpenCLArray m_gpuPairs; diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h index bd693aae8..881d783c0 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -1,6 +1,5 @@ //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project static const char* gridBroadphaseCL= \ -"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" "int getPosHash(int4 gridPos, __global float4* pParams)\n" "{\n" " int4 gridDim = *((__global int4*)(pParams + 1));\n" @@ -47,7 +46,6 @@ static const char* gridBroadphaseCL= \ " __global int* pCellStart )\n" "{\n" " int index = get_global_id(0);\n" -" \n" " if(index >= numCells)\n" " {\n" " return;\n" @@ -68,7 +66,6 @@ static const char* gridBroadphaseCL= \ " sharedHash[get_local_id(0) + 1] = sortedData.x;\n" " if((index > 0) && (get_local_id(0) == 0))\n" " {\n" -" printf(\"%d sharedHash!\\n\", index);\n" " // first thread in block must load neighbor body hash\n" " sharedHash[0] = pHash[index-1].x;\n" " }\n" @@ -78,7 +75,6 @@ static const char* gridBroadphaseCL= \ " {\n" " if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" " {\n" -" printf(\"%d cellStart!\\n\", index);\n" " cellStart[sortedData.x] = index;\n" " }\n" " }\n" @@ -89,6 +85,7 @@ static const char* gridBroadphaseCL= \ " (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" " (min0.z <= max1.z)&& (min1.z <= max0.z); \n" "}\n" +"//search for AABB 'index' against other AABBs' in this cell\n" "void findPairsInCell( int numObjects,\n" " int4 gridPos,\n" " int index,\n" @@ -97,7 +94,10 @@ static const char* gridBroadphaseCL= \ " __global float4* pAABB, \n" " __global int* pPairBuff,\n" " __global int2* pPairBuffStartCurr,\n" -" __global float4* pParams)\n" +" __global float4* pParams,\n" +" volatile __global int* pairCount,\n" +" __global int4* pPairBuff2\n" +" )\n" "{\n" " int4 pGridDim = *((__global int4*)(pParams + 1));\n" " int maxBodiesPerCell = pGridDim.w;\n" @@ -129,40 +129,61 @@ static const char* gridBroadphaseCL= \ " break; // no longer in same bucket\n" " }\n" " int unsorted_indx2 = cellData.y;\n" -" if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" +" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" +" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n" " { \n" " float4 min1 = pAABB[unsorted_indx2*2 + 0];\n" " float4 max1 = pAABB[unsorted_indx2*2 + 1];\n" " if(testAABBOverlap(min0, max0, min1, max1))\n" " {\n" -" int handleIndex2 = as_int(min1.w);\n" -" int k;\n" -" for(k = 0; k < curr; k++)\n" +" if (pairCount)\n" " {\n" -" int old_pair = pPairBuff[start+k] & (~0x60000000);\n" -" if(old_pair == handleIndex2)\n" +" int handleIndex2 = as_int(min1.w);\n" +" if (handleIndex= curr_max) \n" -" { // not a good solution, but let's avoid crash\n" -" break;\n" +" int handleIndex2 = as_int(min1.w);\n" +" int k;\n" +" for(k = 0; k < curr; k++)\n" +" {\n" +" int old_pair = pPairBuff[start+k] & (~0x60000000);\n" +" if(old_pair == handleIndex2)\n" +" {\n" +" pPairBuff[start+k] |= 0x40000000;\n" +" break;\n" +" }\n" +" }\n" +" if(k == curr)\n" +" {\n" +" if(curr >= curr_max) \n" +" { // not a good solution, but let's avoid crash\n" +" break;\n" +" }\n" +" pPairBuff[start+curr] = handleIndex2 | 0x20000000;\n" +" curr++;\n" " }\n" -" pPairBuff[start+curr] = handleIndex2 | 0x20000000;\n" -" curr++;\n" " }\n" " }\n" " }\n" " }\n" -" int2 newStartCurr;\n" -" newStartCurr.x = start;\n" -" newStartCurr.y = curr;\n" -" pPairBuffStartCurr[handleIndex] = newStartCurr;\n" -" return;\n" +" if (!pairCount)\n" +" {\n" +" int2 newStartCurr;\n" +" newStartCurr.x = start;\n" +" newStartCurr.y = curr;\n" +" pPairBuffStartCurr[handleIndex] = newStartCurr;\n" +" }\n" +" \n" "}\n" "__kernel void kFindOverlappingPairs( int numObjects,\n" " __global float4* pAABB, \n" @@ -170,7 +191,10 @@ static const char* gridBroadphaseCL= \ " __global int* pCellStart, \n" " __global int* pPairBuff, \n" " __global int2* pPairBuffStartCurr, \n" -" __global float4* pParams )\n" +" __global float4* pParams ,\n" +" volatile __global int* pairCount,\n" +" __global int4* pPairBuff2\n" +" )\n" "{\n" " int index = get_global_id(0);\n" " if(index >= numObjects)\n" @@ -198,7 +222,7 @@ static const char* gridBroadphaseCL= \ " for(int x=-1; x<=1; x++) \n" " {\n" " gridPosB.x = gridPosA.x + x;\n" -" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams);\n" +" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams, pairCount,pPairBuff2);\n" " }\n" " }\n" " }\n" diff --git a/src/Bullet3OpenCL/RigidBody/b3Solver.cpp b/src/Bullet3OpenCL/RigidBody/b3Solver.cpp index 83625d585..6756f76a2 100644 --- a/src/Bullet3OpenCL/RigidBody/b3Solver.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3Solver.cpp @@ -17,7 +17,7 @@ subject to the following restrictions: #include "b3Solver.h" ///useNewBatchingKernel is a rewritten kernel using just a single thread of the warp, for experiments -bool useNewBatchingKernel = false; +bool useNewBatchingKernel = true; bool convertConstraintOnCpu = false; #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"