From b0c43520e3c6ab8a0c011c96fb4615bf96f72b89 Mon Sep 17 00:00:00 2001 From: erwincoumans Date: Sun, 10 Nov 2013 22:00:35 -0800 Subject: [PATCH] more work towards re-enabling grid broadphase. --- Demos3/GpuDemos/GpuDemo.h | 6 +- .../GpuDemos/rigidbody/GpuRigidBodyDemo.cpp | 3 +- Demos3/SimpleOpenGL3/main.cpp | 1 + build3/stringify.bat | 4 + .../b3GpuGridBroadphase.cpp | 187 ++++++++- .../BroadphaseCollision/b3GpuGridBroadphase.h | 28 ++ .../kernels/gridBroadphase.cl | 374 ++++++++++++++++++ .../kernels/gridBroadphaseKernels.h | 331 ++++++++++++++++ 8 files changed, 929 insertions(+), 5 deletions(-) create mode 100644 src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl create mode 100644 src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h diff --git a/Demos3/GpuDemos/GpuDemo.h b/Demos3/GpuDemos/GpuDemo.h index f779f34cb..0cfe70ef0 100644 --- a/Demos3/GpuDemos/GpuDemo.h +++ b/Demos3/GpuDemos/GpuDemo.h @@ -48,9 +48,9 @@ public: arraySizeZ(10), #else - arraySizeX(1), - arraySizeY(1), - arraySizeZ(1), + arraySizeX(30), + arraySizeY(30), + arraySizeZ(30), #endif m_useConcaveMesh(false), gapX(16.3), diff --git a/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp b/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp index 26945c3d6..838ebcb5a 100644 --- a/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp +++ b/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp @@ -115,7 +115,8 @@ void GpuRigidBodyDemo::initPhysics(const ConstructionInfo& ci) m_data->m_config.m_maxConvexBodies = b3Max(m_data->m_config.m_maxConvexBodies,ci.arraySizeX*ci.arraySizeY*ci.arraySizeZ+10); m_data->m_config.m_maxConvexShapes = m_data->m_config.m_maxConvexBodies; - m_data->m_config.m_maxBroadphasePairs = 16*m_data->m_config.m_maxConvexBodies; + int maxPairsPerBody = 128; + m_data->m_config.m_maxBroadphasePairs = maxPairsPerBody*m_data->m_config.m_maxConvexBodies; m_data->m_config.m_maxContactCapacity = m_data->m_config.m_maxBroadphasePairs; diff --git a/Demos3/SimpleOpenGL3/main.cpp b/Demos3/SimpleOpenGL3/main.cpp index 31cce1a7f..4e8d9d142 100644 --- a/Demos3/SimpleOpenGL3/main.cpp +++ b/Demos3/SimpleOpenGL3/main.cpp @@ -1,6 +1,7 @@ #include "../../btgui/OpenGLWindow/SimpleOpenGL3App.h" #include "Bullet3Common/b3Vector3.h" #include "assert.h" +#include int main(int argc, char* argv[]) { diff --git a/build3/stringify.bat b/build3/stringify.bat index 5680be14c..8a0bb7392 100644 --- a/build3/stringify.bat +++ b/build3/stringify.bat @@ -12,6 +12,10 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/ParallelP premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h" --stringname="sapCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify + + + premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h" --stringname="satKernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl" --headerfile="../src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h" --stringname="primitiveContactsKernelsCL" stringify diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp index 7e022d3d8..354434f43 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -1,17 +1,99 @@ #include "b3GpuGridBroadphase.h" #include "Bullet3Geometry/b3AabbUtil.h" +#include "kernels/gridBroadphaseKernels.h" + +//#include "kernels/gridBroadphase.cl" + + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + + + + +#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" + +cl_kernel kCalcHashAABB; +cl_kernel kClearCellStart; +cl_kernel kFindCellStart; +cl_kernel kFindOverlappingPairs; + + +cl_kernel kFindPairsLarge; +cl_kernel kComputePairCacheChanges; +cl_kernel kSqueezeOverlappingPairBuff; + + +int maxPairsPerBody = 32; +int maxBodiesPerCell = 1024;//?? b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ) :m_context(ctx), m_device(device), m_queue(q), m_allAabbsGPU(ctx,q), -m_gpuPairs(ctx,q) +m_gpuPairs(ctx,q), +m_hashGpu(ctx,q), +m_paramsGPU(ctx,q), +m_cellStartGpu(ctx,q) { + + + b3Vector3 gridSize = b3MakeVector3(3,3,3); + b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]); + + m_paramsCPU.m_gridSize[0] = 128; + m_paramsCPU.m_gridSize[1] = 128; + m_paramsCPU.m_gridSize[2] = 128; + m_paramsCPU.m_gridSize[3] = maxBodiesPerCell; + m_paramsCPU.setMaxBodiesPerCell(maxBodiesPerCell); + m_paramsCPU.m_invCellSize[0] = invGridSize[0]; + m_paramsCPU.m_invCellSize[1] = invGridSize[1]; + m_paramsCPU.m_invCellSize[2] = invGridSize[2]; + m_paramsCPU.m_invCellSize[3] = 0.f; + m_paramsGPU.push_back(m_paramsCPU); + + cl_int errNum=0; + cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,0,&errNum,"",B3_GRID_BROADPHASE_PATH,true); + b3Assert(errNum==CL_SUCCESS); + + kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + + kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kFindPairsLarge = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindPairsLarge",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kComputePairCacheChanges = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kComputePairCacheChanges",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + kSqueezeOverlappingPairBuff = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kSqueezeOverlappingPairBuff",&errNum,gridProg); + b3Assert(errNum==CL_SUCCESS); + + m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue); + } b3GpuGridBroadphase::~b3GpuGridBroadphase() { + clReleaseKernel( kCalcHashAABB); + clReleaseKernel( kClearCellStart); + clReleaseKernel( kFindCellStart); + clReleaseKernel( kFindOverlappingPairs); + + clReleaseKernel( kFindPairsLarge); + clReleaseKernel( kComputePairCacheChanges); + clReleaseKernel( kSqueezeOverlappingPairBuff); + delete m_sorter; } @@ -34,6 +116,109 @@ void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Ve void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs) { calculateOverlappingPairsHost(maxPairs); + return; + int numAabbs = m_allAabbsGPU.size(); + if (numAabbs) + { + m_hashGpu.resize(numAabbs); + { + b3LauncherCL launch(m_queue,kCalcHashAABB); + launch.setConst(numAabbs); + launch.setBuffer(m_allAabbsGPU.getBufferCL()); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(this->m_paramsGPU.getBufferCL()); + launch.launch1D(numAabbs); + } + + m_sorter->execute(m_hashGpu); + + int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2]; + m_cellStartGpu.resize(numCells); + //b3AlignedObjectArray cellStartCpu; + + + { + b3LauncherCL launch(m_queue,kClearCellStart); + launch.setConst(numCells); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.launch1D(numCells); + //m_cellStartGpu.copyToHost(cellStartCpu); + //printf("??\n"); + + } + + + { + + b3LauncherCL launch(m_queue,kFindCellStart); + launch.setConst(numAabbs); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.launch1D(numAabbs); + //m_cellStartGpu.copyToHost(cellStartCpu); + //printf("??\n"); + + } + + { + + + b3OpenCLArray pairsGpu2(m_context,m_queue); + b3OpenCLArray pairsGpu(m_context,m_queue); + b3OpenCLArray pairStartCurGpu(m_context,m_queue); + b3AlignedObjectArray pairStartCpu; + + m_gpuPairs.resize(numAabbs*maxPairsPerBody); + pairsGpu2.resize(numAabbs*maxPairsPerBody); + pairsGpu.resize(numAabbs*maxPairsPerBody); + pairStartCurGpu.resize(numAabbs*2+2); + + pairStartCpu.resize(numAabbs*2+2); + + pairStartCpu[0] = 0; + pairStartCpu[1] = 0; + for(int i = 1; i <= numAabbs; i++) + { + pairStartCpu[i * 2] = pairStartCpu[(i-1) * 2] + maxPairsPerBody; + pairStartCpu[i * 2 + 1] = 0; + } + pairStartCurGpu.copyFromHost(pairStartCpu); + + b3OpenCLArray pairCount(m_context,m_queue); + pairCount.push_back(0); + + b3LauncherCL launch(m_queue,kFindOverlappingPairs); + launch.setConst(numAabbs); + launch.setBuffer(m_allAabbsGPU.getBufferCL()); + launch.setBuffer(m_hashGpu.getBufferCL()); + launch.setBuffer(m_cellStartGpu.getBufferCL()); + launch.setBuffer(pairsGpu.getBufferCL()); + launch.setBuffer(pairStartCurGpu.getBufferCL()); + launch.setBuffer(m_paramsGPU.getBufferCL()); + //launch.setBuffer(0); + launch.setBuffer(pairCount.getBufferCL()); + launch.setBuffer(m_gpuPairs.getBufferCL()); + + launch.launch1D(numAabbs); + + + + int actualCount = pairCount.at(0); + //b3AlignedObjectArray pairsCpu; + m_gpuPairs.resize(actualCount); + //m_gpuPairs.copyToHost(pairsCpu); + //printf("?!?\n"); + + } + + + } + + + + + + //calculateOverlappingPairsHost(maxPairs); } void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs) { diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h index 8bd21cbcb..84b4721ab 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -2,6 +2,25 @@ #define B3_GPU_GRID_BROADPHASE_H #include "b3GpuBroadphaseInterface.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +struct b3ParamsGridBroadphaseCL +{ + + float m_invCellSize[4]; + int m_gridSize[4]; + + int getMaxBodiesPerCell() const + { + return m_gridSize[3]; + } + + void setMaxBodiesPerCell(int maxOverlap) + { + m_gridSize[3] = maxOverlap; + } +}; + class b3GpuGridBroadphase : public b3GpuBroadphaseInterface { @@ -16,6 +35,15 @@ protected: b3AlignedObjectArray m_hostPairs; b3OpenCLArray m_gpuPairs; + b3OpenCLArray m_hashGpu; + b3OpenCLArray m_cellStartGpu; + + + b3ParamsGridBroadphaseCL m_paramsCPU; + b3OpenCLArray m_paramsGPU; + + class b3RadixSort32CL* m_sorter; + public: b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q ); diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl new file mode 100644 index 000000000..9d4daf07c --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl @@ -0,0 +1,374 @@ + + +int getPosHash(int4 gridPos, __global float4* pParams) +{ + int4 gridDim = *((__global int4*)(pParams + 1)); + gridPos.x &= gridDim.x - 1; + gridPos.y &= gridDim.y - 1; + gridPos.z &= gridDim.z - 1; + int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x; + return hash; +} + +int4 getGridPos(float4 worldPos, __global float4* pParams) +{ + int4 gridPos; + int4 gridDim = *((__global int4*)(pParams + 1)); + gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1); + gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1); + gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1); + return gridPos; +} + + +// calculate grid hash value for each body using its AABB +__kernel void kCalcHashAABB(int numObjects, __global float4* pAABB, __global int2* pHash, __global float4* pParams ) +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + float4 bbMin = pAABB[index*2]; + float4 bbMax = pAABB[index*2 + 1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + pos.w = 0.f; + // get address in grid + int4 gridPos = getGridPos(pos, pParams); + int gridHash = getPosHash(gridPos, pParams); + // store grid hash and body index + int2 hashVal; + hashVal.x = gridHash; + hashVal.y = index; + pHash[index] = hashVal; +} + +__kernel void kClearCellStart( int numCells, + __global int* pCellStart ) +{ + int index = get_global_id(0); + if(index >= numCells) + { + return; + } + pCellStart[index] = -1; +} + +__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart ) +{ + __local int sharedHash[513]; + int index = get_global_id(0); + int2 sortedData; + + if(index < numObjects) + { + sortedData = pHash[index]; + // Load hash data into shared memory so that we can look + // at neighboring body's hash value without loading + // two hash values per thread + sharedHash[get_local_id(0) + 1] = sortedData.x; + if((index > 0) && (get_local_id(0) == 0)) + { + // first thread in block must load neighbor body hash + sharedHash[0] = pHash[index-1].x; + } + } + barrier(CLK_LOCAL_MEM_FENCE); + if(index < numObjects) + { + if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)])) + { + cellStart[sortedData.x] = index; + } + } +} + +int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1) +{ + return (min0.x <= max1.x)&& (min1.x <= max0.x) && + (min0.y <= max1.y)&& (min1.y <= max0.y) && + (min0.z <= max1.z)&& (min1.z <= max0.z); +} + + + + +//search for AABB 'index' against other AABBs' in this cell +void findPairsInCell( int numObjects, + int4 gridPos, + int index, + __global int2* pHash, + __global int* pCellStart, + __global float4* pAABB, + __global int* pPairBuff, + __global int2* pPairBuffStartCurr, + __global float4* pParams, + volatile __global int* pairCount, + __global int4* pPairBuff2 + ) +{ + int4 pGridDim = *((__global int4*)(pParams + 1)); + int maxBodiesPerCell = pGridDim.w; + int gridHash = getPosHash(gridPos, pParams); + // get start of bucket for this cell + int bucketStart = pCellStart[gridHash]; + if (bucketStart == -1) + { + return; // cell empty + } + // iterate over bodies in this cell + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 min0 = pAABB[unsorted_indx*2 + 0]; + float4 max0 = pAABB[unsorted_indx*2 + 1]; + int handleIndex = as_int(min0.w); + int2 start_curr = pPairBuffStartCurr[handleIndex]; + int start = start_curr.x; + int curr = start_curr.y; + int2 start_curr_next = pPairBuffStartCurr[handleIndex+1]; + int curr_max = start_curr_next.x - start - 1; + int bucketEnd = bucketStart + maxBodiesPerCell; + bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd; + for(int index2 = bucketStart; index2 < bucketEnd; index2++) + { + int2 cellData = pHash[index2]; + if (cellData.x != gridHash) + { + break; // no longer in same bucket + } + int unsorted_indx2 = cellData.y; + //if (unsorted_indx2 < unsorted_indx) // check not colliding with self + if (unsorted_indx2 != unsorted_indx) // check not colliding with self + { + float4 min1 = pAABB[unsorted_indx2*2 + 0]; + float4 max1 = pAABB[unsorted_indx2*2 + 1]; + if(testAABBOverlap(min0, max0, min1, max1)) + { + if (pairCount) + { + int handleIndex2 = as_int(min1.w); + //if (handleIndex= curr_max) + { // not a good solution, but let's avoid crash + break; + } + pPairBuff[start+curr] = handleIndex2 | 0x20000000; + curr++; + } + } + } + } + } + if (!pairCount) + { + int2 newStartCurr; + newStartCurr.x = start; + newStartCurr.y = curr; + pPairBuffStartCurr[handleIndex] = newStartCurr; + } + +} + +__kernel void kFindOverlappingPairs( int numObjects, + __global float4* pAABB, + __global int2* pHash, + __global int* pCellStart, + __global int* pPairBuff, + __global int2* pPairBuffStartCurr, + __global float4* pParams , + volatile __global int* pairCount, + __global int4* pPairBuff2 + ) + +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 bbMin = pAABB[unsorted_indx*2 + 0]; + float4 bbMax = pAABB[unsorted_indx*2 + 1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + // get address in grid + int4 gridPosA = getGridPos(pos, pParams); + int4 gridPosB; + // examine only neighbouring cells + for(int z=0; z<=1; z++) + { + gridPosB.z = gridPosA.z + z; + for(int y=0; y<=1; y++) + { + gridPosB.y = gridPosA.y + y; + for(int x=0; x<=1; x++) + { + gridPosB.x = gridPosA.x + x; + findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams, pairCount,pPairBuff2); + } + } + } +} + + +__kernel void kFindPairsLarge( int numObjects, + __global float4* pAABB, + __global int2* pHash, + __global int* pCellStart, + __global int* pPairBuff, + __global int2* pPairBuffStartCurr, + uint numLarge ) +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + int2 sortedData = pHash[index]; + int unsorted_indx = sortedData.y; + float4 min0 = pAABB[unsorted_indx*2 + 0]; + float4 max0 = pAABB[unsorted_indx*2 + 1]; + int handleIndex = as_int(min0.w); + int2 start_curr = pPairBuffStartCurr[handleIndex]; + int start = start_curr.x; + int curr = start_curr.y; + int2 start_curr_next = pPairBuffStartCurr[handleIndex+1]; + int curr_max = start_curr_next.x - start - 1; + for(uint i = 0; i < numLarge; i++) + { + int indx2 = numObjects + i; + float4 min1 = pAABB[indx2*2 + 0]; + float4 max1 = pAABB[indx2*2 + 1]; + if(testAABBOverlap(min0, max0, min1, max1)) + { + int k; + int handleIndex2 = as_int(min1.w); + for(k = 0; k < curr; k++) + { + int old_pair = pPairBuff[start+k] & (~0x60000000); + if(old_pair == handleIndex2) + { + pPairBuff[start+k] |= 0x40000000; + break; + } + } + if(k == curr) + { + pPairBuff[start+curr] = handleIndex2 | 0x20000000; + if(curr >= curr_max) + { // not a good solution, but let's avoid crash + break; + } + curr++; + } + } + } + int2 newStartCurr; + newStartCurr.x = start; + newStartCurr.y = curr; + pPairBuffStartCurr[handleIndex] = newStartCurr; + return; +} + +__kernel void kComputePairCacheChanges( int numObjects, + __global int* pPairBuff, + __global int2* pPairBuffStartCurr, + __global int* pPairScan, + __global float4* pAABB ) +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + float4 bbMin = pAABB[index * 2]; + int handleIndex = as_int(bbMin.w); + int2 start_curr = pPairBuffStartCurr[handleIndex]; + int start = start_curr.x; + int curr = start_curr.y; + __global int *pInp = pPairBuff + start; + int num_changes = 0; + for(int k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & 0x40000000)) + { + num_changes++; + } + } + pPairScan[index+1] = num_changes; +} + +__kernel void kSqueezeOverlappingPairBuff( int numObjects, + __global int* pPairBuff, + __global int2* pPairBuffStartCurr, + __global int* pPairScan, + __global int* pPairOut, + __global float4* pAABB ) +{ + int index = get_global_id(0); + if(index >= numObjects) + { + return; + } + float4 bbMin = pAABB[index * 2]; + int handleIndex = as_int(bbMin.w); + int2 start_curr = pPairBuffStartCurr[handleIndex]; + int start = start_curr.x; + int curr = start_curr.y; + __global int* pInp = pPairBuff + start; + __global int* pOut = pPairOut + pPairScan[index+1]; + __global int* pOut2 = pInp; + int num = 0; + for(int k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & 0x40000000)) + { + *pOut = *pInp; + pOut++; + } + if((*pInp) & 0x60000000) + { + *pOut2 = (*pInp) & (~0x60000000); + pOut2++; + num++; + } + } + int2 newStartCurr; + newStartCurr.x = start; + newStartCurr.y = num; + pPairBuffStartCurr[handleIndex] = newStartCurr; +} + + + diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h new file mode 100644 index 000000000..bd693aae8 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h @@ -0,0 +1,331 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* gridBroadphaseCL= \ +"#pragma OPENCL EXTENSION cl_amd_printf : enable\n" +"int getPosHash(int4 gridPos, __global float4* pParams)\n" +"{\n" +" int4 gridDim = *((__global int4*)(pParams + 1));\n" +" gridPos.x &= gridDim.x - 1;\n" +" gridPos.y &= gridDim.y - 1;\n" +" gridPos.z &= gridDim.z - 1;\n" +" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n" +" return hash;\n" +"} \n" +"int4 getGridPos(float4 worldPos, __global float4* pParams)\n" +"{\n" +" int4 gridPos;\n" +" int4 gridDim = *((__global int4*)(pParams + 1));\n" +" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n" +" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n" +" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n" +" return gridPos;\n" +"}\n" +"// calculate grid hash value for each body using its AABB\n" +"__kernel void kCalcHashAABB(int numObjects, __global float4* pAABB, __global int2* pHash, __global float4* pParams )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" float4 bbMin = pAABB[index*2];\n" +" float4 bbMax = pAABB[index*2 + 1];\n" +" float4 pos;\n" +" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +" pos.w = 0.f;\n" +" // get address in grid\n" +" int4 gridPos = getGridPos(pos, pParams);\n" +" int gridHash = getPosHash(gridPos, pParams);\n" +" // store grid hash and body index\n" +" int2 hashVal;\n" +" hashVal.x = gridHash;\n" +" hashVal.y = index;\n" +" pHash[index] = hashVal;\n" +"}\n" +"__kernel void kClearCellStart( int numCells, \n" +" __global int* pCellStart )\n" +"{\n" +" int index = get_global_id(0);\n" +" \n" +" if(index >= numCells)\n" +" {\n" +" return;\n" +" }\n" +" pCellStart[index] = -1;\n" +"}\n" +"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n" +"{\n" +" __local int sharedHash[513];\n" +" int index = get_global_id(0);\n" +" int2 sortedData;\n" +" if(index < numObjects)\n" +" {\n" +" sortedData = pHash[index];\n" +" // Load hash data into shared memory so that we can look \n" +" // at neighboring body's hash value without loading\n" +" // two hash values per thread\n" +" sharedHash[get_local_id(0) + 1] = sortedData.x;\n" +" if((index > 0) && (get_local_id(0) == 0))\n" +" {\n" +" printf(\"%d sharedHash!\\n\", index);\n" +" // first thread in block must load neighbor body hash\n" +" sharedHash[0] = pHash[index-1].x;\n" +" }\n" +" }\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" if(index < numObjects)\n" +" {\n" +" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n" +" {\n" +" printf(\"%d cellStart!\\n\", index);\n" +" cellStart[sortedData.x] = index;\n" +" }\n" +" }\n" +"}\n" +"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n" +"{\n" +" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n" +" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n" +" (min0.z <= max1.z)&& (min1.z <= max0.z); \n" +"}\n" +"void findPairsInCell( int numObjects,\n" +" int4 gridPos,\n" +" int index,\n" +" __global int2* pHash,\n" +" __global int* pCellStart,\n" +" __global float4* pAABB, \n" +" __global int* pPairBuff,\n" +" __global int2* pPairBuffStartCurr,\n" +" __global float4* pParams)\n" +"{\n" +" int4 pGridDim = *((__global int4*)(pParams + 1));\n" +" int maxBodiesPerCell = pGridDim.w;\n" +" int gridHash = getPosHash(gridPos, pParams);\n" +" // get start of bucket for this cell\n" +" int bucketStart = pCellStart[gridHash];\n" +" if (bucketStart == -1)\n" +" {\n" +" return; // cell empty\n" +" }\n" +" // iterate over bodies in this cell\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 min0 = pAABB[unsorted_indx*2 + 0]; \n" +" float4 max0 = pAABB[unsorted_indx*2 + 1];\n" +" int handleIndex = as_int(min0.w);\n" +" int2 start_curr = pPairBuffStartCurr[handleIndex];\n" +" int start = start_curr.x;\n" +" int curr = start_curr.y;\n" +" int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];\n" +" int curr_max = start_curr_next.x - start - 1;\n" +" int bucketEnd = bucketStart + maxBodiesPerCell;\n" +" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n" +" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n" +" {\n" +" int2 cellData = pHash[index2];\n" +" if (cellData.x != gridHash)\n" +" {\n" +" break; // no longer in same bucket\n" +" }\n" +" int unsorted_indx2 = cellData.y;\n" +" if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n" +" { \n" +" float4 min1 = pAABB[unsorted_indx2*2 + 0];\n" +" float4 max1 = pAABB[unsorted_indx2*2 + 1];\n" +" if(testAABBOverlap(min0, max0, min1, max1))\n" +" {\n" +" int handleIndex2 = as_int(min1.w);\n" +" int k;\n" +" for(k = 0; k < curr; k++)\n" +" {\n" +" int old_pair = pPairBuff[start+k] & (~0x60000000);\n" +" if(old_pair == handleIndex2)\n" +" {\n" +" pPairBuff[start+k] |= 0x40000000;\n" +" break;\n" +" }\n" +" }\n" +" if(k == curr)\n" +" {\n" +" if(curr >= curr_max) \n" +" { // not a good solution, but let's avoid crash\n" +" break;\n" +" }\n" +" pPairBuff[start+curr] = handleIndex2 | 0x20000000;\n" +" curr++;\n" +" }\n" +" }\n" +" }\n" +" }\n" +" int2 newStartCurr;\n" +" newStartCurr.x = start;\n" +" newStartCurr.y = curr;\n" +" pPairBuffStartCurr[handleIndex] = newStartCurr;\n" +" return;\n" +"}\n" +"__kernel void kFindOverlappingPairs( int numObjects,\n" +" __global float4* pAABB, \n" +" __global int2* pHash, \n" +" __global int* pCellStart, \n" +" __global int* pPairBuff, \n" +" __global int2* pPairBuffStartCurr, \n" +" __global float4* pParams )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 bbMin = pAABB[unsorted_indx*2 + 0];\n" +" float4 bbMax = pAABB[unsorted_indx*2 + 1];\n" +" float4 pos;\n" +" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n" +" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n" +" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n" +" // get address in grid\n" +" int4 gridPosA = getGridPos(pos, pParams);\n" +" int4 gridPosB; \n" +" // examine only neighbouring cells\n" +" for(int z=-1; z<=1; z++) \n" +" {\n" +" gridPosB.z = gridPosA.z + z;\n" +" for(int y=-1; y<=1; y++) \n" +" {\n" +" gridPosB.y = gridPosA.y + y;\n" +" for(int x=-1; x<=1; x++) \n" +" {\n" +" gridPosB.x = gridPosA.x + x;\n" +" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams);\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void kFindPairsLarge( int numObjects, \n" +" __global float4* pAABB, \n" +" __global int2* pHash, \n" +" __global int* pCellStart, \n" +" __global int* pPairBuff, \n" +" __global int2* pPairBuffStartCurr, \n" +" uint numLarge )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" int2 sortedData = pHash[index];\n" +" int unsorted_indx = sortedData.y;\n" +" float4 min0 = pAABB[unsorted_indx*2 + 0];\n" +" float4 max0 = pAABB[unsorted_indx*2 + 1];\n" +" int handleIndex = as_int(min0.w);\n" +" int2 start_curr = pPairBuffStartCurr[handleIndex];\n" +" int start = start_curr.x;\n" +" int curr = start_curr.y;\n" +" int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];\n" +" int curr_max = start_curr_next.x - start - 1;\n" +" for(uint i = 0; i < numLarge; i++)\n" +" {\n" +" int indx2 = numObjects + i;\n" +" float4 min1 = pAABB[indx2*2 + 0];\n" +" float4 max1 = pAABB[indx2*2 + 1];\n" +" if(testAABBOverlap(min0, max0, min1, max1))\n" +" {\n" +" int k;\n" +" int handleIndex2 = as_int(min1.w);\n" +" for(k = 0; k < curr; k++)\n" +" {\n" +" int old_pair = pPairBuff[start+k] & (~0x60000000);\n" +" if(old_pair == handleIndex2)\n" +" {\n" +" pPairBuff[start+k] |= 0x40000000;\n" +" break;\n" +" }\n" +" }\n" +" if(k == curr)\n" +" {\n" +" pPairBuff[start+curr] = handleIndex2 | 0x20000000;\n" +" if(curr >= curr_max) \n" +" { // not a good solution, but let's avoid crash\n" +" break;\n" +" }\n" +" curr++;\n" +" }\n" +" }\n" +" }\n" +" int2 newStartCurr;\n" +" newStartCurr.x = start;\n" +" newStartCurr.y = curr;\n" +" pPairBuffStartCurr[handleIndex] = newStartCurr;\n" +" return;\n" +"}\n" +"__kernel void kComputePairCacheChanges( int numObjects,\n" +" __global int* pPairBuff, \n" +" __global int2* pPairBuffStartCurr, \n" +" __global int* pPairScan, \n" +" __global float4* pAABB )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" float4 bbMin = pAABB[index * 2];\n" +" int handleIndex = as_int(bbMin.w);\n" +" int2 start_curr = pPairBuffStartCurr[handleIndex];\n" +" int start = start_curr.x;\n" +" int curr = start_curr.y;\n" +" __global int *pInp = pPairBuff + start;\n" +" int num_changes = 0;\n" +" for(int k = 0; k < curr; k++, pInp++)\n" +" {\n" +" if(!((*pInp) & 0x40000000))\n" +" {\n" +" num_changes++;\n" +" }\n" +" }\n" +" pPairScan[index+1] = num_changes;\n" +"} \n" +"__kernel void kSqueezeOverlappingPairBuff( int numObjects,\n" +" __global int* pPairBuff, \n" +" __global int2* pPairBuffStartCurr, \n" +" __global int* pPairScan,\n" +" __global int* pPairOut, \n" +" __global float4* pAABB )\n" +"{\n" +" int index = get_global_id(0);\n" +" if(index >= numObjects)\n" +" {\n" +" return;\n" +" }\n" +" float4 bbMin = pAABB[index * 2];\n" +" int handleIndex = as_int(bbMin.w);\n" +" int2 start_curr = pPairBuffStartCurr[handleIndex];\n" +" int start = start_curr.x;\n" +" int curr = start_curr.y;\n" +" __global int* pInp = pPairBuff + start;\n" +" __global int* pOut = pPairOut + pPairScan[index+1];\n" +" __global int* pOut2 = pInp;\n" +" int num = 0; \n" +" for(int k = 0; k < curr; k++, pInp++)\n" +" {\n" +" if(!((*pInp) & 0x40000000))\n" +" {\n" +" *pOut = *pInp;\n" +" pOut++;\n" +" }\n" +" if((*pInp) & 0x60000000)\n" +" {\n" +" *pOut2 = (*pInp) & (~0x60000000);\n" +" pOut2++;\n" +" num++;\n" +" }\n" +" }\n" +" int2 newStartCurr;\n" +" newStartCurr.x = start;\n" +" newStartCurr.y = num;\n" +" pPairBuffStartCurr[handleIndex] = newStartCurr;\n" +"}\n" +;