diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index 10fbeb79f..58ec34ab8 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -55,6 +55,7 @@ class b3GpuParallelLinearBvh cl_program m_parallelLinearBvhProgram; + cl_kernel m_findAllNodesMergedAabbKernel; cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; cl_kernel m_constructBinaryTreeKernel; cl_kernel m_determineInternalNodeAabbsKernel; @@ -78,6 +79,7 @@ class b3GpuParallelLinearBvh //1 element per leaf node b3OpenCLArray m_leafNodeParentNodes; b3OpenCLArray m_mortonCodesAndAabbIndicies; //m_key = morton code, m_value == aabb index + b3OpenCLArray m_mergedAabb; public: b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : @@ -91,7 +93,8 @@ public: m_internalNodeChildNodes(context, queue), m_internalNodeParentNodes(context, queue), m_leafNodeParentNodes(context, queue), - m_mortonCodesAndAabbIndicies(context, queue) + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue) { const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; @@ -101,6 +104,8 @@ public: m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); b3Assert(m_parallelLinearBvhProgram); + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findAllNodesMergedAabbKernel); m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros ); @@ -114,6 +119,7 @@ public: virtual ~b3GpuParallelLinearBvh() { + clReleaseKernel(m_findAllNodesMergedAabbKernel); clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); clReleaseKernel(m_constructBinaryTreeKernel); clReleaseKernel(m_determineInternalNodeAabbsKernel); @@ -125,8 +131,7 @@ public: // fix: need to handle/test case with 2 nodes - ///@param cellsize A virtual grid of size 2^10^3 is used in the process of creating the BVH - void build(const b3OpenCLArray& worldSpaceAabbs, b3Scalar cellSize) + void build(const b3OpenCLArray& worldSpaceAabbs) { B3_PROFILE("b3ParallelLinearBvh::build()"); @@ -143,6 +148,7 @@ public: m_leafNodeParentNodes.resize(numLeaves); m_mortonCodesAndAabbIndicies.resize(numLeaves); + m_mergedAabb.resize(numLeaves); } //Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) ) @@ -160,7 +166,7 @@ public: //If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++; - if(1) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit); + if(0) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit); } //Determine number of nodes per level, use prefix sum to get offsets of each level, and send to GPU @@ -202,7 +208,7 @@ public: m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i]; } - if(1) + if(0) { int numInternalNodes = 0; for(int i = 0; i < numLevels; ++i) @@ -225,20 +231,22 @@ public: { B3_PROFILE("Find AABB of merged nodes"); - /*b3BufferInfoCL bufferInfo[] = + m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs); //Need to make a copy since the kernel modifies the array + + b3BufferInfoCL bufferInfo[] = { - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( m_allNodesMergedAabb.getBufferCL() ), + b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] }; - b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabb, "m_findAllNodesMergedAabb"); + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); launcher.setConst(numLeaves); launcher.launch1D(numLeaves); - clFinish(m_queue);*/ + clFinish(m_queue); } + //Insert the center of the AABBs into a virtual grid, //then convert the discrete grid coordinates into a morton code //For each element in m_mortonCodesAndAabbIndicies, set @@ -250,12 +258,12 @@ public: b3BufferInfoCL bufferInfo[] = { b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( m_mergedAabb.getBufferCL() ), b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) }; b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(cellSize); launcher.setConst(numLeaves); launcher.launch1D(numLeaves); @@ -301,7 +309,7 @@ public: launcher.launch1D(numInternalNodes); clFinish(m_queue); - if(1) + if(0) { static b3AlignedObjectArray internalNodeChildNodes; m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false); @@ -335,6 +343,12 @@ public: launcher.launch1D(numLeaves); clFinish(m_queue); + if(0) + { + b3SapAabb mergedAABB = m_mergedAabb.at(0); + printf("mergedAABBMin: %f, %f, %f \n", mergedAABB.m_minVec.x, mergedAABB.m_minVec.y, mergedAABB.m_minVec.z); + printf("mergedAABBMax: %f, %f, %f \n", mergedAABB.m_maxVec.x, mergedAABB.m_maxVec.y, mergedAABB.m_maxVec.z); + } if(0) { static b3AlignedObjectArray rigidAabbs; diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h index 728215b7a..007067460 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -56,14 +56,8 @@ public: virtual void calculateOverlappingPairs(int maxPairs) { - //Detect overall min/max - { - //Not implemented - } - //Reconstruct BVH - const b3Scalar CELL_SIZE(0.1); - m_plbvh.build(m_aabbsGpu, CELL_SIZE); + m_plbvh.build(m_aabbsGpu); // m_overlappingPairsGpu.resize(maxPairs); diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl index a5127c3aa..51c8ce281 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -80,8 +80,8 @@ unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z) return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; } -/* -__kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* out_mergedAabb, int numAabbs) + +__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs) { int aabbIndex = get_global_id(0); if(aabbIndex >= numAabbs) return; @@ -89,11 +89,11 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa //Find the most significant bit(msb) int mostSignificantBit = 0; { - int temp = numLeaves; + int temp = numAabbs; while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) } - int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit ); // verify + int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit ); int numRemainingAabbs = (1 << mostSignificantBit); //Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2 @@ -102,8 +102,8 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa { int otherAabbIndex = numRemainingAabbs + aabbIndex; - b3AabbCL aabb = worldSpaceAabbs[aabbIndex]; - b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex]; + b3AabbCL aabb = out_mergedAabb[aabbIndex]; + b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; b3AabbCL mergedAabb; mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); @@ -121,8 +121,8 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa { int otherAabbIndex = aabbIndex + offset; - b3AabbCL aabb = worldSpaceAabbs[aabbIndex]; - b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex]; + b3AabbCL aabb = out_mergedAabb[aabbIndex]; + b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; b3AabbCL mergedAabb; mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); @@ -130,27 +130,29 @@ __kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __globa out_mergedAabb[aabbIndex] = mergedAabb; } - offset = offset / 2; + offset /= 2; barrier(CLK_GLOBAL_MEM_FENCE); } } -*/ -__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, - __global SortDataCL* out_mortonCodesAndAabbIndices, - b3Scalar cellSize, int numAabbs) +__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, + __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs) { int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index if(leafNodeIndex >= numAabbs) return; - b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex]; + b3AabbCL mergedAabb = mergedAabbOfAllNodes[0]; + b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f; + b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024; - b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f; + b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex]; + b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f; + b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter; //Quantize into integer coordinates //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size - b3Vector3 gridPosition = center / cellSize; + b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize; int4 discretePosition; discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) ); diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h index a6cabdb99..fc3273bbc 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -75,20 +75,77 @@ static const char* parallelLinearBvhCL= \ "{\n" " return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" "}\n" -"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, \n" -" __global SortDataCL* out_mortonCodesAndAabbIndices, \n" -" b3Scalar cellSize, int numAabbs)\n" +"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbs)\n" +"{\n" +" int aabbIndex = get_global_id(0);\n" +" if(aabbIndex >= numAabbs) return;\n" +" \n" +" //Find the most significant bit(msb)\n" +" int mostSignificantBit = 0;\n" +" {\n" +" int temp = numAabbs;\n" +" while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1)\n" +" }\n" +" \n" +" int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit );\n" +" int numRemainingAabbs = (1 << mostSignificantBit);\n" +" \n" +" //Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2\n" +" //For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30]\n" +" if(aabbIndex < numberOfAabbsAboveMsbSplit)\n" +" {\n" +" int otherAabbIndex = numRemainingAabbs + aabbIndex;\n" +" \n" +" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +" out_mergedAabb[aabbIndex] = mergedAabb;\n" +" }\n" +" \n" +" barrier(CLK_GLOBAL_MEM_FENCE);\n" +" \n" +" //\n" +" int offset = numRemainingAabbs / 2;\n" +" while(offset >= 1)\n" +" {\n" +" if(aabbIndex < offset)\n" +" {\n" +" int otherAabbIndex = aabbIndex + offset;\n" +" \n" +" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +" out_mergedAabb[aabbIndex] = mergedAabb;\n" +" }\n" +" \n" +" offset /= 2;\n" +" \n" +" barrier(CLK_GLOBAL_MEM_FENCE);\n" +" }\n" +"}\n" +"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" +" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" "{\n" " int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" " if(leafNodeIndex >= numAabbs) return;\n" " \n" -" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" +" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" +" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" " \n" -" b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;\n" +" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" +" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" " \n" " //Quantize into integer coordinates\n" " //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" -" b3Vector3 gridPosition = center / cellSize;\n" +" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" " \n" " int4 discretePosition;\n" " discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" @@ -110,7 +167,7 @@ static const char* parallelLinearBvhCL= \ " out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" "}\n" "#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" -"#define B3_PLBVH_ROOT_NODE_MARKER -1 //Used to indicate that the node has no parent \n" +"#define B3_PLBVH_ROOT_NODE_MARKER -1 //Used to indicate that the (root) node has no parent \n" "#define B3_PLBVH_ROOT_NODE_INDEX 0\n" "//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.\n" "//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).\n" @@ -264,13 +321,16 @@ static const char* parallelLinearBvhCL= \ " __global int* out_numPairs, __global int4* out_overlappingPairs, \n" " int maxPairs, int numQueryAabbs)\n" "{\n" +"#define USE_SPATIALLY_COHERENT_INDICIES //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve\n" +"#ifdef USE_SPATIALLY_COHERENT_INDICIES\n" " int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" " if(queryRigidIndex >= numQueryAabbs) return;\n" " \n" " queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n" -" //int queryRigidIndex = get_global_id(0);\n" -" //if(queryRigidIndex >= numQueryAabbs) return;\n" -" \n" +"#else\n" +" int queryRigidIndex = get_global_id(0);\n" +" if(queryRigidIndex >= numQueryAabbs) return;\n" +"#endif\n" " b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" " \n" " int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"