diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp new file mode 100644 index 000000000..c6169ee8e --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -0,0 +1,425 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + +#include "b3GpuParallelLinearBvh.h" + +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : + m_queue(queue), + m_fill(context, device, queue), + m_radixSorter(context, device, queue), + + m_rootNodeIndex(context, queue), + + m_numNodesPerLevelGpu(context, queue), + m_firstIndexOffsetPerLevelGpu(context, queue), + + m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue), + m_leafNodeAabbs(context, queue) +{ + m_rootNodeIndex.resize(1); + + // + const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + cl_int error; + char* additionalMacros = 0; + m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); + b3Assert(m_parallelLinearBvhProgram); + + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findAllNodesMergedAabbKernel); + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); + + m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_constructBinaryTreeKernel); + m_determineInternalNodeAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "determineInternalNodeAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_determineInternalNodeAabbsKernel); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhCalculateOverlappingPairsKernel); + m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhRayTraverseKernel); +} + +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() +{ + clReleaseKernel(m_findAllNodesMergedAabbKernel); + clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); + clReleaseKernel(m_constructBinaryTreeKernel); + clReleaseKernel(m_determineInternalNodeAabbsKernel); + + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); + clReleaseKernel(m_plbvhRayTraverseKernel); + + clReleaseProgram(m_parallelLinearBvhProgram); +} + +void b3GpuParallelLinearBvh::build(const b3OpenCLArray& worldSpaceAabbs) +{ + B3_PROFILE("b3ParallelLinearBvh::build()"); + + m_leafNodeAabbs.copyFromOpenCLArray(worldSpaceAabbs); + + // + int numLeaves = m_leafNodeAabbs.size(); //Number of leaves in the BVH == Number of rigid body AABBs + int numInternalNodes = numLeaves - 1; + + if(numLeaves < 2) + { + int rootNodeIndex = numLeaves - 1; + m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); + return; + } + + // + { + m_internalNodeAabbs.resize(numInternalNodes); + m_internalNodeLeafIndexRanges.resize(numInternalNodes); + m_internalNodeChildNodes.resize(numInternalNodes); + m_internalNodeParentNodes.resize(numInternalNodes); + + m_leafNodeParentNodes.resize(numLeaves); + m_mortonCodesAndAabbIndicies.resize(numLeaves); + m_mergedAabb.resize(numLeaves); + } + + //Find the AABB of all input AABBs; this is used to define the size of + //each cell in the virtual grid(2^10 cells in each dimension). + { + B3_PROFILE("Find AABB of merged nodes"); + + m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs); //Need to make a copy since the kernel modifies the array + + for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] + }; + + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numAabbsNeedingMerge); + + launcher.launch1D(numAabbsNeedingMerge); + } + + clFinish(m_queue); + } + + + //Insert the center of the AABBs into a virtual grid, + //then convert the discrete grid coordinates into a morton code + //For each element in m_mortonCodesAndAabbIndicies, set + // m_key == morton code (value to sort by) + // m_value = AABB index + { + B3_PROFILE("Assign morton codes"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_mergedAabb.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + // + { + B3_PROFILE("Sort leaves by morton codes"); + + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); + clFinish(m_queue); + } + + //Optional; only element at m_internalNodeParentNodes[0], the root node, needs to be set here + //as the parent indices of other nodes are overwritten during m_constructBinaryTreeKernel + { + B3_PROFILE("Reset parent node indices"); + + m_fill.execute( m_internalNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_internalNodeParentNodes.size() ); + m_fill.execute( m_leafNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_leafNodeParentNodes.size() ); + clFinish(m_queue); + } + + constructSimpleBinaryTree(); +} + +void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray& out_numPairs, b3OpenCLArray& out_overlappingPairs) +{ + b3Assert( out_numPairs.size() == 1 ); + + int maxPairs = out_overlappingPairs.size(); + + int reset = 0; + out_numPairs.copyFromHostPointer(&reset, 1); + + if( m_leafNodeAabbs.size() < 2 ) return; + + { + B3_PROFILE("PLBVH calculateOverlappingPairs"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( out_numPairs.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + + // + int numPairs = -1; + out_numPairs.copyToHostPointer(&numPairs, 1); + if(numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + out_numPairs.copyFromHostPointer(&maxPairs, 1); + } + + out_overlappingPairs.resize(numPairs); +} + + +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray& rays, + b3OpenCLArray& out_numRayRigidPairs, b3OpenCLArray& out_rayRigidPairs) +{ + B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); + + int numRays = rays.size(); + int maxRayRigidPairs = out_rayRigidPairs.size(); + + int reset = 0; + out_numRayRigidPairs.copyFromHostPointer(&reset, 1); + + if( m_leafNodeAabbs.size() < 1 ) return; + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + + + // + int numRayRigidPairs = -1; + out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); + + if(numRayRigidPairs > maxRayRigidPairs) + b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); + +} + +void b3GpuParallelLinearBvh::constructSimpleBinaryTree() +{ + int numLeaves = m_leafNodeAabbs.size(); //Number of leaves in the BVH == Number of rigid body AABBs + int numInternalNodes = numLeaves - 1; + + //Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) ) + //The number of levels is equivalent to the number of bits needed to uniquely identify each node(including both internal and leaf nodes) + int numLevels = 0; + { + //Find the most significant bit(msb) + int mostSignificantBit = 0; + { + int temp = numLeaves; + while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) + } + numLevels = mostSignificantBit + 1; + + //If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level + if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++; + } + + //Determine number of internal nodes per level, use prefix sum to get offsets of each level, and send to GPU + { + B3_PROFILE("Determine number of nodes per level"); + + m_numNodesPerLevelCpu.resize(numLevels); + + //The last level contains the leaf nodes; number of leaves is already known + if(numLevels - 1 >= 0) m_numNodesPerLevelCpu[numLevels - 1] = numLeaves; + + //Calculate number of nodes in each level; + //start from the second to last level(level right next to leaf nodes) and move towards the root(level 0) + int remainder = 0; + for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex) + { + int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves + int numNodesCurrentLevel = numNodesPreviousLevel / 2; + + remainder += numNodesPreviousLevel % 2; + if(remainder == 2) + { + numNodesCurrentLevel++; + remainder = 0; + } + + m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel; + } + + //Prefix sum to calculate the first index offset of each level + { + m_firstIndexOffsetPerLevelCpu = m_numNodesPerLevelCpu; + + //Perform inclusive scan + for(int i = 1; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) + m_firstIndexOffsetPerLevelCpu[i] += m_firstIndexOffsetPerLevelCpu[i - 1]; + + //Convert inclusive scan to exclusive scan to get the offsets + //This is equivalent to shifting each element in m_firstIndexOffsetPerLevelCpu[] by 1 to the right, + //and setting the first element to 0 + for(int i = 0; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) + m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i]; + } + + //Copy to GPU + m_numNodesPerLevelGpu.copyFromHost(m_numNodesPerLevelCpu, false); + m_firstIndexOffsetPerLevelGpu.copyFromHost(m_firstIndexOffsetPerLevelCpu, false); + clFinish(m_queue); + } + + //Construct binary tree; find the children of each internal node, and assign parent nodes + { + B3_PROFILE("Construct binary tree"); + + const int ROOT_NODE_INDEX = 0x80000000; //Default root index is 0, most significant bit is set to indicate internal node + m_rootNodeIndex.copyFromHostPointer(&ROOT_NODE_INDEX, 1); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_constructBinaryTreeKernel, "m_constructBinaryTreeKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLevels); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //For each internal node, check children to get its AABB; start from the + //last level, which contains the leaves, and move towards the root + { + B3_PROFILE("Set AABBs"); + + //Due to the arrangement of internal nodes, each internal node corresponds + //to a contiguous range of leaf node indices. This characteristic can be used + //to optimize calculateOverlappingPairs(); checking if + //(m_internalNodeLeafIndexRanges[].y < leafNodeIndex) can be used to ensure that + //each pair is processed only once. + { + B3_PROFILE("Reset internal node index ranges"); + + b3Int2 invalidIndexRange; + invalidIndexRange.x = -1; //x == min + invalidIndexRange.y = -2; //y == max + + m_fill.execute( m_internalNodeLeafIndexRanges, invalidIndexRange, m_internalNodeLeafIndexRanges.size() ); + clFinish(m_queue); + } + + int lastInternalLevelIndex = numLevels - 2; //Last level is leaf node level + for(int level = lastInternalLevelIndex; level >= 0; --level) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLevels); + launcher.setConst(numInternalNodes); + launcher.setConst(level); + + launcher.launch1D(numLeaves); + } + clFinish(m_queue); + } +} + + +void b3GpuParallelLinearBvh::constructRadixBinaryTree() +{ +} + + \ No newline at end of file diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h index 81d455044..89c6cedb7 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -20,8 +20,6 @@ subject to the following restrictions: #include "Bullet3Common/shared/b3Int4.h" #include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" -#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" -#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" @@ -56,9 +54,12 @@ class b3GpuParallelLinearBvh cl_kernel m_findAllNodesMergedAabbKernel; cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; + + //Binary tree construction kernels cl_kernel m_constructBinaryTreeKernel; cl_kernel m_determineInternalNodeAabbsKernel; + //Traversal kernels cl_kernel m_plbvhCalculateOverlappingPairsKernel; cl_kernel m_plbvhRayTraverseKernel; @@ -87,405 +88,29 @@ class b3GpuParallelLinearBvh b3OpenCLArray m_leafNodeAabbs; public: - b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : - m_queue(queue), - m_fill(context, device, queue), - m_radixSorter(context, device, queue), - - m_rootNodeIndex(context, queue), - - m_numNodesPerLevelGpu(context, queue), - m_firstIndexOffsetPerLevelGpu(context, queue), - - m_internalNodeAabbs(context, queue), - m_internalNodeLeafIndexRanges(context, queue), - m_internalNodeChildNodes(context, queue), - m_internalNodeParentNodes(context, queue), - - m_leafNodeParentNodes(context, queue), - m_mortonCodesAndAabbIndicies(context, queue), - m_mergedAabb(context, queue), - m_leafNodeAabbs(context, queue) - { - m_rootNodeIndex.resize(1); + b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvh(); - // - const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; - - const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h - cl_int error; - char* additionalMacros = 0; - m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); - b3Assert(m_parallelLinearBvhProgram); - - m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_findAllNodesMergedAabbKernel); - m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); - m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_constructBinaryTreeKernel); - m_determineInternalNodeAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "determineInternalNodeAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_determineInternalNodeAabbsKernel); - - m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_plbvhCalculateOverlappingPairsKernel); - m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); - b3Assert(m_plbvhRayTraverseKernel); - } - - virtual ~b3GpuParallelLinearBvh() - { - clReleaseKernel(m_findAllNodesMergedAabbKernel); - clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); - clReleaseKernel(m_constructBinaryTreeKernel); - clReleaseKernel(m_determineInternalNodeAabbsKernel); - - clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); - clReleaseKernel(m_plbvhRayTraverseKernel); - - clReleaseProgram(m_parallelLinearBvhProgram); - } - - void build(const b3OpenCLArray& worldSpaceAabbs) - { - B3_PROFILE("b3ParallelLinearBvh::build()"); - - int numLeaves = worldSpaceAabbs.size(); //Number of leaves in the BVH == Number of rigid body AABBs - int numInternalNodes = numLeaves - 1; - - // - m_leafNodeAabbs.copyFromOpenCLArray(worldSpaceAabbs); - - if(numLeaves < 2) - { - int rootNodeIndex = numLeaves - 1; - m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); - return; - } - - // - { - m_internalNodeAabbs.resize(numInternalNodes); - m_internalNodeLeafIndexRanges.resize(numInternalNodes); - m_internalNodeChildNodes.resize(numInternalNodes); - m_internalNodeParentNodes.resize(numInternalNodes); - - m_leafNodeParentNodes.resize(numLeaves); - m_mortonCodesAndAabbIndicies.resize(numLeaves); - m_mergedAabb.resize(numLeaves); - } - - //Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) ) - //The number of levels is equivalent to the number of bits needed to uniquely identify each node(including both internal and leaf nodes) - int numLevels = 0; - { - //Find the most significant bit(msb) - int mostSignificantBit = 0; - { - int temp = numLeaves; - while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) - } - numLevels = mostSignificantBit + 1; - - //If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level - if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++; - } - - //Determine number of internal nodes per level, use prefix sum to get offsets of each level, and send to GPU - { - B3_PROFILE("Determine number of nodes per level"); - - m_numNodesPerLevelCpu.resize(numLevels); - - //The last level contains the leaf nodes; number of leaves is already known - if(numLevels - 1 >= 0) m_numNodesPerLevelCpu[numLevels - 1] = numLeaves; - - //Calculate number of nodes in each level; - //start from the second to last level(level right next to leaf nodes) and move towards the root(level 0) - int remainder = 0; - for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex) - { - int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves - int numNodesCurrentLevel = numNodesPreviousLevel / 2; - - remainder += numNodesPreviousLevel % 2; - if(remainder == 2) - { - numNodesCurrentLevel++; - remainder = 0; - } - - m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel; - } - - //Prefix sum to calculate the first index offset of each level - { - m_firstIndexOffsetPerLevelCpu = m_numNodesPerLevelCpu; - - //Perform inclusive scan - for(int i = 1; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) - m_firstIndexOffsetPerLevelCpu[i] += m_firstIndexOffsetPerLevelCpu[i - 1]; - - //Convert inclusive scan to exclusive scan to get the offsets - //This is equivalent to shifting each element in m_firstIndexOffsetPerLevelCpu[] by 1 to the right, - //and setting the first element to 0 - for(int i = 0; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) - m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i]; - } - - //Copy to GPU - m_numNodesPerLevelGpu.copyFromHost(m_numNodesPerLevelCpu, false); - m_firstIndexOffsetPerLevelGpu.copyFromHost(m_firstIndexOffsetPerLevelCpu, false); - clFinish(m_queue); - } - - //Find the AABB of all input AABBs; this is used to define the size of - //each cell in the virtual grid(2^10 cells in each dimension). - { - B3_PROFILE("Find AABB of merged nodes"); - - m_mergedAabb.copyFromOpenCLArray(worldSpaceAabbs); //Need to make a copy since the kernel modifies the array - - for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; - numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) - { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] - }; - - b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numAabbsNeedingMerge); - - launcher.launch1D(numAabbsNeedingMerge); - } - - clFinish(m_queue); - } - - - //Insert the center of the AABBs into a virtual grid, - //then convert the discrete grid coordinates into a morton code - //For each element in m_mortonCodesAndAabbIndicies, set - // m_key == morton code (value to sort by) - // m_value = AABB index - { - B3_PROFILE("Assign morton codes"); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( m_mergedAabb.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLeaves); - - launcher.launch1D(numLeaves); - clFinish(m_queue); - } - - // - { - B3_PROFILE("Sort leaves by morton codes"); - - m_radixSorter.execute(m_mortonCodesAndAabbIndicies); - clFinish(m_queue); - } - - //Optional; only element at m_internalNodeParentNodes[0], the root node, needs to be set here - //as the parent indices of other nodes are overwritten during m_constructBinaryTreeKernel - { - B3_PROFILE("Reset parent node indices"); - - m_fill.execute( m_internalNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_internalNodeParentNodes.size() ); - m_fill.execute( m_leafNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_leafNodeParentNodes.size() ); - clFinish(m_queue); - } - - //Construct binary tree; find the children of each internal node, and assign parent nodes - { - B3_PROFILE("Construct binary tree"); - - const int ROOT_NODE_INDEX = 0x80000000; //Default root index is 0, most significant bit is set to indicate internal node - m_rootNodeIndex.copyFromHostPointer(&ROOT_NODE_INDEX, 1); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), - b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_constructBinaryTreeKernel, "m_constructBinaryTreeKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLevels); - launcher.setConst(numInternalNodes); - - launcher.launch1D(numInternalNodes); - clFinish(m_queue); - } - - //For each internal node, check children to get its AABB; start from the - //last level, which contains the leaves, and move towards the root - { - B3_PROFILE("Set AABBs"); - - //Due to the arrangement of internal nodes, each internal node corresponds - //to a contiguous range of leaf node indices. This characteristic can be used - //to optimize calculateOverlappingPairs(); checking if - //(m_internalNodeLeafIndexRanges[].y < leafNodeIndex) can be used to ensure that - //each pair is processed only once. - { - B3_PROFILE("Reset internal node index ranges"); - - b3Int2 invalidIndexRange; - invalidIndexRange.x = -1; //x == min - invalidIndexRange.y = -2; //y == max - - m_fill.execute( m_internalNodeLeafIndexRanges, invalidIndexRange, m_internalNodeLeafIndexRanges.size() ); - clFinish(m_queue); - } - - int lastInternalLevelIndex = numLevels - 2; //Last level is leaf node level - for(int level = lastInternalLevelIndex; level >= 0; --level) - { - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(numLevels); - launcher.setConst(numInternalNodes); - launcher.setConst(level); - - launcher.launch1D(numLeaves); - } - clFinish(m_queue); - } - } + void build(const b3OpenCLArray& worldSpaceAabbs); ///b3GpuParallelLinearBvh::build() must be called before this function. calculateOverlappingPairs() uses ///the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs. ///@param out_numPairs If number of pairs exceeds the max number of pairs, this is clamped to the max number. ///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs. ///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized. - void calculateOverlappingPairs(b3OpenCLArray& out_numPairs, b3OpenCLArray& out_overlappingPairs) - { - b3Assert( out_numPairs.size() == 1 ); - - int maxPairs = out_overlappingPairs.size(); - - int reset = 0; - out_numPairs.copyFromHostPointer(&reset, 1); - - if( m_leafNodeAabbs.size() < 2 ) return; - - { - B3_PROFILE("PLBVH calculateOverlappingPairs"); - - int numQueryAabbs = m_leafNodeAabbs.size(); - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - - b3BufferInfoCL( out_numPairs.getBufferCL() ), - b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(maxPairs); - launcher.setConst(numQueryAabbs); - - launcher.launch1D(numQueryAabbs); - clFinish(m_queue); - } - - - // - int numPairs = -1; - out_numPairs.copyToHostPointer(&numPairs, 1); - if(numPairs > maxPairs) - { - b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); - numPairs = maxPairs; - out_numPairs.copyFromHostPointer(&maxPairs, 1); - } - - out_overlappingPairs.resize(numPairs); - } + void calculateOverlappingPairs(b3OpenCLArray& out_numPairs, b3OpenCLArray& out_overlappingPairs); ///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections; ///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough. ///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index. ///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. void testRaysAgainstBvhAabbs(const b3OpenCLArray& rays, - b3OpenCLArray& out_numRayRigidPairs, b3OpenCLArray& out_rayRigidPairs) - { - B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); - - int numRays = rays.size(); - int maxRayRigidPairs = out_rayRigidPairs.size(); - - int reset = 0; - out_numRayRigidPairs.copyFromHostPointer(&reset, 1); - - if( m_leafNodeAabbs.size() < 1 ) return; - - b3BufferInfoCL bufferInfo[] = - { - b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), - - b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), - b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), - b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), - b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), - b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), - - b3BufferInfoCL( rays.getBufferCL() ), - - b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), - b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) - }; - - b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); - launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); - launcher.setConst(maxRayRigidPairs); - launcher.setConst(numRays); - - launcher.launch1D(numRays); - clFinish(m_queue); - - - // - int numRayRigidPairs = -1; - out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); - - if(numRayRigidPairs > maxRayRigidPairs) - b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); - - } + b3OpenCLArray& out_numRayRigidPairs, b3OpenCLArray& out_rayRigidPairs); + +private: + void constructSimpleBinaryTree(); + + void constructRadixBinaryTree(); }; #endif diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp new file mode 100644 index 000000000..6d7293e2c --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -0,0 +1,52 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "b3GpuParallelLinearBvhBroadphase.h" + +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : + m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + m_aabbsGpu(context, queue), + m_tempNumPairs(context, queue) +{ + m_tempNumPairs.resize(1); +} + +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) +{ + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + + m_aabbsCpu.push_back(aabb); +} +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) +{ + b3Assert(0); //Not implemented +} + +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs) +{ + //Reconstruct BVH + m_plbvh.build(m_aabbsGpu); + + // + m_overlappingPairsGpu.resize(maxPairs); + m_plbvh.calculateOverlappingPairs(m_tempNumPairs, m_overlappingPairsGpu); +} +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + b3Assert(0); //CPU version not implemented +} diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h index c5b85f4bb..2280b48ce 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -29,44 +29,14 @@ class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface b3AlignedObjectArray m_aabbsCpu; public: - b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : - m_plbvh(context, device, queue), - - m_overlappingPairsGpu(context, queue), - m_aabbsGpu(context, queue), - m_tempNumPairs(context, queue) - { - m_tempNumPairs.resize(1); - } + b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue); virtual ~b3GpuParallelLinearBvhBroadphase() {} - virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) - { - b3SapAabb aabb; - aabb.m_minVec = aabbMin; - aabb.m_maxVec = aabbMax; - aabb.m_minIndices[3] = userPtr; - - m_aabbsCpu.push_back(aabb); - } - virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) - { - b3Assert(0); //Not implemented - } + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask); - virtual void calculateOverlappingPairs(int maxPairs) - { - //Reconstruct BVH - m_plbvh.build(m_aabbsGpu); - - // - m_overlappingPairsGpu.resize(maxPairs); - m_plbvh.calculateOverlappingPairs(m_tempNumPairs, m_overlappingPairsGpu); - } - virtual void calculateOverlappingPairsHost(int maxPairs) - { - b3Assert(0); //CPU version not implemented - } + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); //call writeAabbsToGpu after done making all changes (createProxy etc) virtual void writeAabbsToGpu() { m_aabbsGpu.copyFromHost(m_aabbsCpu); }