diff --git a/Demos3/GpuDemos/broadphase/PairBench.cpp b/Demos3/GpuDemos/broadphase/PairBench.cpp index 9c445417c..53995fd65 100644 --- a/Demos3/GpuDemos/broadphase/PairBench.cpp +++ b/Demos3/GpuDemos/broadphase/PairBench.cpp @@ -9,6 +9,7 @@ #include "OpenGLWindow/b3gWindowInterface.h" #include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h" #include "Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h" #include "../GpuDemoInternalData.h" #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" @@ -108,6 +109,7 @@ static int curSelectedBroadphase = 0; static BroadphaseEntry allBroadphases[]= { {"Gpu Grid",b3GpuGridBroadphase::CreateFunc}, + {"Parallel Linear BVH",b3GpuParallelLinearBvhBroadphase::CreateFunc}, {"CPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceCpu}, {"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu}, {"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal}, diff --git a/build3/stringify.bat b/build3/stringify.bat index 7c65b8149..8876515c7 100644 --- a/build3/stringify.bat +++ b/build3/stringify.bat @@ -13,6 +13,7 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/Broadphas premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify diff --git a/build3/stringify_linux.sh b/build3/stringify_linux.sh index ed1627c27..751574af8 100755 --- a/build3/stringify_linux.sh +++ b/build3/stringify_linux.sh @@ -14,6 +14,7 @@ rem @echo off ./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify ./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify +./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h new file mode 100644 index 000000000..10fbeb79f --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -0,0 +1,416 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H +#define B3_GPU_PARALLEL_LINEAR_BVH_H + +//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" + +#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" + +#define B3_PLBVH_ROOT_NODE_MARKER -1 //Syncronize with parallelLinearBvh.cl + +///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame +///@remarks +///Main references: \n +///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n +///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n +///@par +///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) +/// - [fully parallel] Sort morton codes +/// - [somewhat parallel] Build radix binary tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs +///@par +///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. +///The BVH implementation here is almost the same as [Karras 2012], but a different method is used for constructing the tree. +/// - Instead of building a binary radix tree, we simply pair each node with its nearest sibling. +/// This has the effect of further worsening the quality of the BVH, but the main spatial partitioning is done by the +/// Z-curve anyways, and this method should be simpler and faster during construction. Additionally, it is still possible +/// to improve the quality of the BVH by rearranging the connections between nodes. +/// - Due to the way the tree is constructed, it becomes unnecessary to use atomic_inc to get +/// the AABB for each internal node. Rather than traveling upwards from the leaf nodes, as in the paper, +/// each internal node checks its child nodes to get its AABB. +class b3GpuParallelLinearBvh +{ + cl_command_queue m_queue; + + cl_program m_parallelLinearBvhProgram; + + cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; + cl_kernel m_constructBinaryTreeKernel; + cl_kernel m_determineInternalNodeAabbsKernel; + + cl_kernel m_plbvhCalculateOverlappingPairsKernel; + + b3FillCL m_fill; + b3RadixSort32CL m_radixSorter; + + //1 element per level in the tree + b3AlignedObjectArray m_numNodesPerLevelCpu; //Level 0(m_numNodesPerLevelCpu[0]) is the root, last level contains the leaf nodes + b3AlignedObjectArray m_firstIndexOffsetPerLevelCpu; //Contains the index/offset of the first node in that level + b3OpenCLArray m_numNodesPerLevelGpu; + b3OpenCLArray m_firstIndexOffsetPerLevelGpu; + + //1 element per internal node (number_of_internal_nodes = number_of_leaves - 1); index 0 is the root node + b3OpenCLArray m_internalNodeAabbs; + b3OpenCLArray m_internalNodeChildNodes; //x == left child, y == right child + b3OpenCLArray m_internalNodeParentNodes; + + //1 element per leaf node + b3OpenCLArray m_leafNodeParentNodes; + b3OpenCLArray m_mortonCodesAndAabbIndicies; //m_key = morton code, m_value == aabb index + +public: + b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : + m_queue(queue), + m_fill(context, device, queue), + m_radixSorter(context, device, queue), + + m_numNodesPerLevelGpu(context, queue), + m_firstIndexOffsetPerLevelGpu(context, queue), + m_internalNodeAabbs(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue) + { + const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + cl_int error; + char* additionalMacros = 0; + m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); + b3Assert(m_parallelLinearBvhProgram); + + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); + m_constructBinaryTreeKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "constructBinaryTree", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_constructBinaryTreeKernel); + m_determineInternalNodeAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "determineInternalNodeAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_determineInternalNodeAabbsKernel); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhCalculateOverlappingPairsKernel); + } + + virtual ~b3GpuParallelLinearBvh() + { + clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); + clReleaseKernel(m_constructBinaryTreeKernel); + clReleaseKernel(m_determineInternalNodeAabbsKernel); + + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); + + clReleaseProgram(m_parallelLinearBvhProgram); + } + + // fix: need to handle/test case with 2 nodes + + ///@param cellsize A virtual grid of size 2^10^3 is used in the process of creating the BVH + void build(const b3OpenCLArray& worldSpaceAabbs, b3Scalar cellSize) + { + B3_PROFILE("b3ParallelLinearBvh::build()"); + + int numLeaves = worldSpaceAabbs.size(); //Number of leaves in the BVH == Number of rigid body AABBs + int numInternalNodes = numLeaves - 1; + + if(numLeaves < 2) return; + + // + { + m_internalNodeAabbs.resize(numInternalNodes); + m_internalNodeChildNodes.resize(numInternalNodes); + m_internalNodeParentNodes.resize(numInternalNodes); + + m_leafNodeParentNodes.resize(numLeaves); + m_mortonCodesAndAabbIndicies.resize(numLeaves); + } + + //Determine number of levels in the binary tree( numLevels = ceil( log2(numLeaves) ) ) + //The number of levels is equivalent to the number of bits needed to uniquely identify each node(including both internal and leaf nodes) + int numLevels = 0; + { + //Find the most significant bit(msb) + int mostSignificantBit = 0; + { + int temp = numLeaves; + while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) + } + numLevels = mostSignificantBit + 1; + + //If the number of nodes is not a power of 2(as in, can be expressed as 2^N where N is an integer), then there is 1 additional level + if( ~(1 << mostSignificantBit) & numLeaves ) numLevels++; + + if(1) printf("numLeaves, numLevels, mostSignificantBit: %d, %d, %d \n", numLeaves, numLevels, mostSignificantBit); + } + + //Determine number of nodes per level, use prefix sum to get offsets of each level, and send to GPU + { + B3_PROFILE("Determine number of nodes per level"); + + m_numNodesPerLevelCpu.resize(numLevels); + + //The last level contains the leaf nodes; number of leaves is already known + if(numLevels - 1 >= 0) m_numNodesPerLevelCpu[numLevels - 1] = numLeaves; + + //Calculate number of nodes in each level; + //start from the second to last level(level right next to leaf nodes) and move towards the root(level 0) + int hasRemainder = 0; + for(int levelIndex = numLevels - 2; levelIndex >= 0; --levelIndex) + { + int numNodesPreviousLevel = m_numNodesPerLevelCpu[levelIndex + 1]; //For first iteration this == numLeaves + + bool allNodesAllocated = ( (numNodesPreviousLevel + hasRemainder) % 2 == 0 ); + + int numNodesCurrentLevel = (allNodesAllocated) ? (numNodesPreviousLevel + hasRemainder) / 2 : numNodesPreviousLevel / 2; + m_numNodesPerLevelCpu[levelIndex] = numNodesCurrentLevel; + + hasRemainder = static_cast(!allNodesAllocated); + } + + //Prefix sum to calculate the first index offset of each level + { + m_firstIndexOffsetPerLevelCpu = m_numNodesPerLevelCpu; + + //Perform inclusive scan + for(int i = 1; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) + m_firstIndexOffsetPerLevelCpu[i] += m_firstIndexOffsetPerLevelCpu[i - 1]; + + //Convert inclusive scan to exclusive scan to get the offsets + //This is equivalent to shifting each element in m_firstIndexOffsetPerLevelCpu[] by 1 to the right, + //and setting the first element to 0 + for(int i = 0; i < m_firstIndexOffsetPerLevelCpu.size(); ++i) + m_firstIndexOffsetPerLevelCpu[i] -= m_numNodesPerLevelCpu[i]; + } + + if(1) + { + int numInternalNodes = 0; + for(int i = 0; i < numLevels; ++i) + if(i < numLevels - 1) numInternalNodes += m_numNodesPerLevelCpu[i]; + printf("numInternalNodes: %d\n", numInternalNodes); + + for(int i = 0; i < numLevels; ++i) + printf("numNodes, offset[%d]: %d, %d \n", i, m_numNodesPerLevelCpu[i], m_firstIndexOffsetPerLevelCpu[i]); + printf("\n"); + } + + //Copy to GPU + m_numNodesPerLevelGpu.copyFromHost(m_numNodesPerLevelCpu, false); + m_firstIndexOffsetPerLevelGpu.copyFromHost(m_firstIndexOffsetPerLevelCpu, false); + clFinish(m_queue); + } + + //Find the AABB of all input AABBs; this is used to define the size of + //each cell in the virtual grid(2^10 cells in each dimension). + { + B3_PROFILE("Find AABB of merged nodes"); + + /*b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( m_allNodesMergedAabb.getBufferCL() ), + }; + + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabb, "m_findAllNodesMergedAabb"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue);*/ + } + + //Insert the center of the AABBs into a virtual grid, + //then convert the discrete grid coordinates into a morton code + //For each element in m_mortonCodesAndAabbIndicies, set + // m_key == morton code (value to sort by) + // m_value = AABB index + { + B3_PROFILE("Assign morton codes"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(cellSize); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + // + { + B3_PROFILE("Sort leaves by morton codes"); + + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); + clFinish(m_queue); + } + + //Optional; only element at m_internalNodeParentNodes[0], the root node, needs to be set here + //as the parent indices of other nodes are overwritten during m_constructBinaryTreeKernel + { + B3_PROFILE("Reset parent node indices"); + + m_fill.execute( m_internalNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_internalNodeParentNodes.size() ); + m_fill.execute( m_leafNodeParentNodes, B3_PLBVH_ROOT_NODE_MARKER, m_leafNodeParentNodes.size() ); + clFinish(m_queue); + } + + //Construct binary tree; find the children of each internal node, and assign parent nodes + { + B3_PROFILE("Construct binary tree"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_constructBinaryTreeKernel, "m_constructBinaryTreeKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLevels); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + + if(1) + { + static b3AlignedObjectArray internalNodeChildNodes; + m_internalNodeChildNodes.copyToHost(internalNodeChildNodes, false); + clFinish(m_queue); + + for(int i = 0; i < 256; ++i) printf("ch[%d]: %d, %d\n", i, internalNodeChildNodes[i].x, internalNodeChildNodes[i].y); + printf("\n"); + } + } + + //For each internal node, check children to get its AABB; start from the + //last level and move towards the root + { + B3_PROFILE("Set AABBs"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_firstIndexOffsetPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_numNodesPerLevelGpu.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_determineInternalNodeAabbsKernel, "m_determineInternalNodeAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLevels); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + + if(0) + { + static b3AlignedObjectArray rigidAabbs; + worldSpaceAabbs.copyToHost(rigidAabbs, false); + clFinish(m_queue); + + b3SapAabb actualRootAabb; + actualRootAabb.m_minVec = b3MakeVector3(B3_LARGE_FLOAT, B3_LARGE_FLOAT, B3_LARGE_FLOAT); + actualRootAabb.m_maxVec = b3MakeVector3(-B3_LARGE_FLOAT, -B3_LARGE_FLOAT, -B3_LARGE_FLOAT); + for(int i = 0; i < rigidAabbs.size(); ++i) + { + actualRootAabb.m_minVec.setMin(rigidAabbs[i].m_minVec); + actualRootAabb.m_maxVec.setMax(rigidAabbs[i].m_maxVec); + } + printf("actualRootMin: %f, %f, %f \n", actualRootAabb.m_minVec.x, actualRootAabb.m_minVec.y, actualRootAabb.m_minVec.z); + printf("actualRootMax: %f, %f, %f \n", actualRootAabb.m_maxVec.x, actualRootAabb.m_maxVec.y, actualRootAabb.m_maxVec.z); + + b3SapAabb rootAabb = m_internalNodeAabbs.at(0); + printf("rootMin: %f, %f, %f \n", rootAabb.m_minVec.x, rootAabb.m_minVec.y, rootAabb.m_minVec.z); + printf("rootMax: %f, %f, %f \n", rootAabb.m_maxVec.x, rootAabb.m_maxVec.y, rootAabb.m_maxVec.z); + printf("\n"); + } + } + } + + //Max number of pairs is out_overlappingPairs.size() + //If the number of overlapping pairs is < out_overlappingPairs.size(), the array is resized + void calculateOverlappingPairs(const b3OpenCLArray& worldSpaceAabbs, + b3OpenCLArray& out_numPairs, b3OpenCLArray& out_overlappingPairs) + { + b3Assert( out_numPairs.size() == 1 ); + + int maxPairs = out_overlappingPairs.size(); + + int reset = 0; + out_numPairs.copyFromHostPointer(&reset, 1); + + { + B3_PROFILE("PLBVH calculateOverlappingPairs"); + + int numQueryAabbs = worldSpaceAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( out_numPairs.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + + // + int numPairs = -1; + out_numPairs.copyToHostPointer(&numPairs, 1); + if(numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + out_numPairs.copyFromHostPointer(&maxPairs, 1); + } + + out_overlappingPairs.resize(numPairs); + } +}; + +#endif diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h new file mode 100644 index 000000000..728215b7a --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -0,0 +1,98 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H +#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H + +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" + +#include "b3GpuParallelLinearBvh.h" + +class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface +{ + b3GpuParallelLinearBvh m_plbvh; + + b3OpenCLArray m_overlappingPairsGpu; + b3OpenCLArray m_aabbsGpu; + b3OpenCLArray m_tempNumPairs; + + b3AlignedObjectArray m_aabbsCpu; + +public: + b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : + m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + m_aabbsGpu(context, queue), + m_tempNumPairs(context, queue) + { + m_tempNumPairs.resize(1); + } + virtual ~b3GpuParallelLinearBvhBroadphase() {} + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) + { + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + aabb.m_minIndices[3] = userPtr; + + m_aabbsCpu.push_back(aabb); + } + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) + { + b3Assert(0); //Not implemented + } + + virtual void calculateOverlappingPairs(int maxPairs) + { + //Detect overall min/max + { + //Not implemented + } + + //Reconstruct BVH + const b3Scalar CELL_SIZE(0.1); + m_plbvh.build(m_aabbsGpu, CELL_SIZE); + + // + m_overlappingPairsGpu.resize(maxPairs); + m_plbvh.calculateOverlappingPairs(m_aabbsGpu, m_tempNumPairs, m_overlappingPairsGpu); + } + virtual void calculateOverlappingPairsHost(int maxPairs) + { + b3Assert(0); //CPU version not implemented + } + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu() { m_aabbsGpu.copyFromHost(m_aabbsCpu); } + + virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } + virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); } + + virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } + virtual b3OpenCLArray& getAllAabbsGPU() { return m_aabbsGpu; } + + virtual b3AlignedObjectArray& getAllAabbsCPU() + { + b3Assert(0); //CPU version not implemented + return m_aabbsCpu; + } + + static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) + { + return new b3GpuParallelLinearBvhBroadphase(context, device, queue); + } +}; + +#endif diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl new file mode 100644 index 000000000..a5127c3aa --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -0,0 +1,399 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +typedef float b3Scalar; +typedef float4 b3Vector3; +#define b3Max max +#define b3Min min + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} b3AabbCL; + + +unsigned int interleaveBits(unsigned int x) +{ + //........ ........ ......12 3456789A //x + //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits + + //........ ....1234 56789A12 3456789A //x |= (x << 10) + //........ ....1111 1....... ...11111 //0x 00 0F 80 1F + //........ ....1234 5....... ...6789A //x = ( x | (x << 10) ) & 0x000F801F; + + //.......1 23451234 5.....67 89A6789A //x |= (x << 5) + //.......1 1.....11 1.....11 .....111 //0x 01 83 83 07 + //.......1 2.....34 5.....67 .....89A //x = ( x | (x << 5) ) & 0x01838307; + + //....12.1 2..34534 5..67.67 ..89A89A //x |= (x << 3) + //....1... 1..1...1 1..1...1 ..1...11 //0x 08 91 91 23 + //....1... 2..3...4 5..6...7 ..8...9A //x = ( x | (x << 3) ) & 0x08919123; + + //...11..2 2.33..4N 5.66..77 .88..9NA //x |= (x << 1) ( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} ) + //....1..1 ..1...1. 1..1..1. .1...1.1 //0x 09 22 92 45 + //....1..2 ..3...4. 5..6..7. .8...9.A //x = ( x | (x << 1) ) & 0x09229245; + + //...11.22 .33..445 5.66.77. 88..99AA //x |= (x << 1) + //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 34 92 29 + //....1..2 ..3..4.. 5..6..7. .8..9..A //x = ( x | (x << 1) ) & 0x09349229; + + //........ ........ ......11 11111111 //0x000003FF + x &= 0x000003FF; //Clear all bits above bit 10 + + x = ( x | (x << 10) ) & 0x000F801F; + x = ( x | (x << 5) ) & 0x01838307; + x = ( x | (x << 3) ) & 0x08919123; + x = ( x | (x << 1) ) & 0x09229245; + x = ( x | (x << 1) ) & 0x09349229; + + return x; +} +unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z) +{ + return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; +} + +/* +__kernel void findAllNodesMergedAabb(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* out_mergedAabb, int numAabbs) +{ + int aabbIndex = get_global_id(0); + if(aabbIndex >= numAabbs) return; + + //Find the most significant bit(msb) + int mostSignificantBit = 0; + { + int temp = numLeaves; + while(temp >>= 1) mostSignificantBit++; //Start counting from 0 (0 and 1 have msb 0, 2 has msb 1) + } + + int numberOfAabbsAboveMsbSplit = numAabbs & ~( ~(0) << mostSignificantBit ); // verify + int numRemainingAabbs = (1 << mostSignificantBit); + + //Merge AABBs above most significant bit so that the number of remaining AABBs is a power of 2 + //For example, if there are 159 AABBs = 128 + 31, then merge indices [0, 30] and 128 + [0, 30] + if(aabbIndex < numberOfAabbsAboveMsbSplit) + { + int otherAabbIndex = numRemainingAabbs + aabbIndex; + + b3AabbCL aabb = worldSpaceAabbs[aabbIndex]; + b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); + mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); + out_mergedAabb[aabbIndex] = mergedAabb; + } + + barrier(CLK_GLOBAL_MEM_FENCE); + + // + int offset = numRemainingAabbs / 2; + while(offset >= 1) + { + if(aabbIndex < offset) + { + int otherAabbIndex = aabbIndex + offset; + + b3AabbCL aabb = worldSpaceAabbs[aabbIndex]; + b3AabbCL otherAabb = worldSpaceAabbs[otherAabbIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); + mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); + out_mergedAabb[aabbIndex] = mergedAabb; + } + + offset = offset / 2; + + barrier(CLK_GLOBAL_MEM_FENCE); + } +} +*/ + +__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, + __global SortDataCL* out_mortonCodesAndAabbIndices, + b3Scalar cellSize, int numAabbs) +{ + int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index + if(leafNodeIndex >= numAabbs) return; + + b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex]; + + b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f; + + //Quantize into integer coordinates + //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size + b3Vector3 gridPosition = center / cellSize; + + int4 discretePosition; + discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) ); + discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) ); + discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) ); + + //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023] + discretePosition = b3Max( -512, b3Min(discretePosition, 511) ); + discretePosition += 512; + + //Interleave bits(assign a morton code, also known as a z-curve) + unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z); + + // + SortDataCL mortonCodeIndexPair; + mortonCodeIndexPair.m_key = mortonCode; + mortonCodeIndexPair.m_value = leafNodeIndex; + + out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair; +} + +#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128 +#define B3_PLBVH_ROOT_NODE_MARKER -1 //Used to indicate that the (root) node has no parent +#define B3_PLBVH_ROOT_NODE_INDEX 0 + +//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node. +//Positive index == leaf node, while negative index == internal node (remove negative sign to get index). +// +//Since the root internal node is at index 0, no internal nodes should reference it as a child, +//and so index 0 is always used to indicate a leaf node. +int isLeafNode(int index) { return (index >= 0); } +int getIndexWithInternalNodeMarkerRemoved(int index) { return (index >= 0) ? index : -index; } +int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : -index; } + +__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel, + __global int* numNodesPerLevel, + __global int2* out_internalNodeChildIndices, + __global int* out_internalNodeParentNodes, + __global int* out_leafNodeParentNodes, + int numLevels, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + //Find the level that this node is in, using linear search(could replace with binary search) + int level = 0; + int numInternalLevels = numLevels - 1; //All levels except the last are internal nodes + for(; level < numInternalLevels; ++level) + { + if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break; + } + + //Check lower levels to find child nodes + //Left child is always in the next level, but the same does not apply to the right child + int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level]; + int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1]; //Should never be out of bounds(see for loop above) + + int leftChildLevel = level + 1; + int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2; + + int rightChildLevel = level + 1; + int rightChildIndex = leftChildIndex + 1; + + //Under certain conditions, the right child index as calculated above is invalid; need to find the correct index + // + //First condition: must be at least 2 levels apart from the leaf node level; + //if the current level is right next to the leaf node level, then the right child + //will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access) + // + //Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid + // + //Third condition: must be the last node in its level + if( level < numLevels - 2 + && numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] + && indexInLevel == numNodesPerLevel[level] - 1 ) + { + //Check lower levels until we find a node without a parent + for(; rightChildLevel < numLevels - 1; ++rightChildLevel) + { + int rightChildNextLevel = rightChildLevel + 1; + + //If this branch is taken, it means that the last node in rightChildNextLevel has no parent + if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] ) + { + //Set the node to the last node in rightChildNextLevel + rightChildLevel = rightChildNextLevel; + rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1; + break; + } + } + } + + int isLeftChildLeaf = (leftChildLevel >= numLevels - 1); + int isRightChildLeaf = (rightChildLevel >= numLevels - 1); + + //If left/right child is a leaf node, the index needs to be corrected + //the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array, + //with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays + { + int leafNodeLevel = numLevels - 1; + leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex; + rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex; + } + + //Set the negative sign bit if the node is internal + int2 childIndices; + childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex); + childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex); + out_internalNodeChildIndices[internalNodeIndex] = childIndices; + + //Assign parent node index to children + __global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes; + out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex; + + __global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes; + out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex; +} + +__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel, + __global int* numNodesPerLevel, + __global int2* internalNodeChildIndices, + __global SortDataCL* mortonCodesAndAabbIndices, + __global b3AabbCL* leafNodeAabbs, + __global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes) +{ + int i = get_global_id(0); + if(i >= numInternalNodes) return; + + int numInternalLevels = numLevels - 1; + + //Starting from the level next to the leaf nodes, move towards the root(level 0) + for(int level = numInternalLevels - 1; level >= 0; --level) + { + int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level] + + int numNodesInLevel = numNodesPerLevel[level]; + if(i < numNodesInLevel) + { + int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level]; + int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal]; + + int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x); + int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y); + + int isLeftChildLeaf = isLeafNode(childIndicies.x); + int isRightChildLeaf = isLeafNode(childIndicies.y); + + int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; + int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; + + b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex]; + b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex]; + + b3AabbCL internalNodeAabb; + internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); + internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); + out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb; + } + + barrier(CLK_GLOBAL_MEM_FENCE); + } +} + + +//From sap.cl +#define NEW_PAIR_MARKER -1 + +bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +//From sap.cl + +__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, + __global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs, + __global SortDataCL* mortonCodesAndAabbIndices, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numQueryAabbs) +{ +#define USE_SPATIALLY_COHERENT_INDICIES //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve +#ifdef USE_SPATIALLY_COHERENT_INDICIES + int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(queryRigidIndex >= numQueryAabbs) return; + + queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value; +#else + int queryRigidIndex = get_global_id(0); + if(queryRigidIndex >= numQueryAabbs) return; +#endif + + b3AabbCL queryAabb = rigidAabbs[queryRigidIndex]; + + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + //Starting by placing only the root node index, 0, in the stack causes it to be detected as a leaf node(see isLeafNode() in loop) + int stackSize = 2; + stack[0] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].x; + stack[1] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].y; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) ) + { + if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3]) + { + int4 pair; + pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3]; + pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + + } +} + diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h new file mode 100644 index 000000000..a6cabdb99 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -0,0 +1,325 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* parallelLinearBvhCL= \ +"/*\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose,\n" +"including commercial applications, and to alter it and redistribute it freely,\n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Initial Author Jackson Lee, 2014\n" +"typedef float b3Scalar;\n" +"typedef float4 b3Vector3;\n" +"#define b3Max max\n" +"#define b3Min min\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} b3AabbCL;\n" +"unsigned int interleaveBits(unsigned int x)\n" +"{\n" +" //........ ........ ......12 3456789A //x\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" +" \n" +" //........ ....1234 56789A12 3456789A //x |= (x << 10)\n" +" //........ ....1111 1....... ...11111 //0x 00 0F 80 1F\n" +" //........ ....1234 5....... ...6789A //x = ( x | (x << 10) ) & 0x000F801F; \n" +" \n" +" //.......1 23451234 5.....67 89A6789A //x |= (x << 5)\n" +" //.......1 1.....11 1.....11 .....111 //0x 01 83 83 07\n" +" //.......1 2.....34 5.....67 .....89A //x = ( x | (x << 5) ) & 0x01838307;\n" +" \n" +" //....12.1 2..34534 5..67.67 ..89A89A //x |= (x << 3)\n" +" //....1... 1..1...1 1..1...1 ..1...11 //0x 08 91 91 23\n" +" //....1... 2..3...4 5..6...7 ..8...9A //x = ( x | (x << 3) ) & 0x08919123;\n" +" \n" +" //...11..2 2.33..4N 5.66..77 .88..9NA //x |= (x << 1) ( N indicates overlapping bits, first overlap is bit {4,5} second is {9,A} )\n" +" //....1..1 ..1...1. 1..1..1. .1...1.1 //0x 09 22 92 45\n" +" //....1..2 ..3...4. 5..6..7. .8...9.A //x = ( x | (x << 1) ) & 0x09229245;\n" +" \n" +" //...11.22 .33..445 5.66.77. 88..99AA //x |= (x << 1)\n" +" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 34 92 29\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = ( x | (x << 1) ) & 0x09349229;\n" +" \n" +" //........ ........ ......11 11111111 //0x000003FF\n" +" x &= 0x000003FF; //Clear all bits above bit 10\n" +" \n" +" x = ( x | (x << 10) ) & 0x000F801F;\n" +" x = ( x | (x << 5) ) & 0x01838307;\n" +" x = ( x | (x << 3) ) & 0x08919123;\n" +" x = ( x | (x << 1) ) & 0x09229245;\n" +" x = ( x | (x << 1) ) & 0x09349229;\n" +" \n" +" return x;\n" +"}\n" +"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" +"{\n" +" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" +"}\n" +"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, \n" +" __global SortDataCL* out_mortonCodesAndAabbIndices, \n" +" b3Scalar cellSize, int numAabbs)\n" +"{\n" +" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" +" if(leafNodeIndex >= numAabbs) return;\n" +" \n" +" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +" \n" +" b3Vector3 center = (aabb.m_min + aabb.m_max) * 0.5f;\n" +" \n" +" //Quantize into integer coordinates\n" +" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" +" b3Vector3 gridPosition = center / cellSize;\n" +" \n" +" int4 discretePosition;\n" +" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" +" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" +" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" +" \n" +" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" +" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" +" discretePosition += 512;\n" +" \n" +" //Interleave bits(assign a morton code, also known as a z-curve)\n" +" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" +" \n" +" //\n" +" SortDataCL mortonCodeIndexPair;\n" +" mortonCodeIndexPair.m_key = mortonCode;\n" +" mortonCodeIndexPair.m_value = leafNodeIndex;\n" +" \n" +" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" +"}\n" +"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" +"#define B3_PLBVH_ROOT_NODE_MARKER -1 //Used to indicate that the node has no parent \n" +"#define B3_PLBVH_ROOT_NODE_INDEX 0\n" +"//For elements of internalNodeChildIndices(int2), the negative bit determines whether it is a leaf or internal node.\n" +"//Positive index == leaf node, while negative index == internal node (remove negative sign to get index).\n" +"//\n" +"//Since the root internal node is at index 0, no internal nodes should reference it as a child,\n" +"//and so index 0 is always used to indicate a leaf node.\n" +"int isLeafNode(int index) { return (index >= 0); }\n" +"int getIndexWithInternalNodeMarkerRemoved(int index) { return (index >= 0) ? index : -index; }\n" +"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : -index; }\n" +"__kernel void constructBinaryTree(__global int* firstIndexOffsetPerLevel,\n" +" __global int* numNodesPerLevel,\n" +" __global int2* out_internalNodeChildIndices, \n" +" __global int* out_internalNodeParentNodes, \n" +" __global int* out_leafNodeParentNodes, \n" +" int numLevels, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //Find the level that this node is in, using linear search(could replace with binary search)\n" +" int level = 0;\n" +" int numInternalLevels = numLevels - 1; //All levels except the last are internal nodes\n" +" for(; level < numInternalLevels; ++level)\n" +" {\n" +" if( firstIndexOffsetPerLevel[level] <= internalNodeIndex && internalNodeIndex < firstIndexOffsetPerLevel[level + 1]) break;\n" +" }\n" +" \n" +" //Check lower levels to find child nodes\n" +" //Left child is always in the next level, but the same does not apply to the right child\n" +" int indexInLevel = internalNodeIndex - firstIndexOffsetPerLevel[level];\n" +" int firstIndexInNextLevel = firstIndexOffsetPerLevel[level + 1]; //Should never be out of bounds(see for loop above)\n" +" \n" +" int leftChildLevel = level + 1;\n" +" int leftChildIndex = firstIndexInNextLevel + indexInLevel * 2;\n" +" \n" +" int rightChildLevel = level + 1;\n" +" int rightChildIndex = leftChildIndex + 1;\n" +" \n" +" //Under certain conditions, the right child index as calculated above is invalid; need to find the correct index\n" +" //\n" +" //First condition: must be at least 2 levels apart from the leaf node level;\n" +" //if the current level is right next to the leaf node level, then the right child\n" +" //will never be invalid due to the way the nodes are allocated (also avoid a out-of-bounds memory access)\n" +" //\n" +" //Second condition: not enough nodes in the next level for each parent to have 2 children, so the right child is invalid\n" +" //\n" +" //Third condition: must be the last node in its level\n" +" if( level < numLevels - 2 \n" +" && numNodesPerLevel[level] * 2 > numNodesPerLevel[level + 1] \n" +" && indexInLevel == numNodesPerLevel[level] - 1 )\n" +" {\n" +" //Check lower levels until we find a node without a parent\n" +" for(; rightChildLevel < numLevels - 1; ++rightChildLevel)\n" +" {\n" +" int rightChildNextLevel = rightChildLevel + 1;\n" +" \n" +" //If this branch is taken, it means that the last node in rightChildNextLevel has no parent\n" +" if( numNodesPerLevel[rightChildLevel] * 2 < numNodesPerLevel[rightChildNextLevel] )\n" +" {\n" +" //Set the node to the last node in rightChildNextLevel\n" +" rightChildLevel = rightChildNextLevel;\n" +" rightChildIndex = firstIndexOffsetPerLevel[rightChildNextLevel] + numNodesPerLevel[rightChildNextLevel] - 1;\n" +" break;\n" +" }\n" +" }\n" +" }\n" +" \n" +" int isLeftChildLeaf = (leftChildLevel >= numLevels - 1);\n" +" int isRightChildLeaf = (rightChildLevel >= numLevels - 1);\n" +" \n" +" //If left/right child is a leaf node, the index needs to be corrected\n" +" //the way the index is calculated assumes that the leaf and internal nodes are in a contiguous array,\n" +" //with leaf nodes at the end of the array; in actuality, the leaf and internal nodes are in separate arrays\n" +" {\n" +" int leafNodeLevel = numLevels - 1;\n" +" leftChildIndex = (isLeftChildLeaf) ? leftChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : leftChildIndex;\n" +" rightChildIndex = (isLeftChildLeaf) ? rightChildIndex - firstIndexOffsetPerLevel[leafNodeLevel] : rightChildIndex;\n" +" }\n" +" \n" +" //Set the negative sign bit if the node is internal\n" +" int2 childIndices;\n" +" childIndices.x = getIndexWithInternalNodeMarkerSet(isLeftChildLeaf, leftChildIndex);\n" +" childIndices.y = getIndexWithInternalNodeMarkerSet(isRightChildLeaf, rightChildIndex);\n" +" out_internalNodeChildIndices[internalNodeIndex] = childIndices;\n" +" \n" +" //Assign parent node index to children\n" +" __global int* out_leftChildParentNodeIndices = (isLeftChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n" +" out_leftChildParentNodeIndices[leftChildIndex] = internalNodeIndex;\n" +" \n" +" __global int* out_rightChildParentNodeIndices = (isRightChildLeaf) ? out_leafNodeParentNodes : out_internalNodeParentNodes;\n" +" out_rightChildParentNodeIndices[rightChildIndex] = internalNodeIndex;\n" +"}\n" +"__kernel void determineInternalNodeAabbs(__global int* firstIndexOffsetPerLevel,\n" +" __global int* numNodesPerLevel, \n" +" __global int2* internalNodeChildIndices,\n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global b3AabbCL* leafNodeAabbs, \n" +" __global b3AabbCL* out_internalNodeAabbs, int numLevels, int numInternalNodes)\n" +"{\n" +" int i = get_global_id(0);\n" +" if(i >= numInternalNodes) return;\n" +" \n" +" int numInternalLevels = numLevels - 1;\n" +" \n" +" //Starting from the level next to the leaf nodes, move towards the root(level 0)\n" +" for(int level = numInternalLevels - 1; level >= 0; --level)\n" +" {\n" +" int indexInLevel = i; //Index relative to firstIndexOffsetPerLevel[level]\n" +" \n" +" int numNodesInLevel = numNodesPerLevel[level];\n" +" if(i < numNodesInLevel)\n" +" {\n" +" int internalNodeIndexGlobal = indexInLevel + firstIndexOffsetPerLevel[level];\n" +" int2 childIndicies = internalNodeChildIndices[internalNodeIndexGlobal];\n" +" \n" +" int leftChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.x);\n" +" int rightChildIndex = getIndexWithInternalNodeMarkerRemoved(childIndicies.y);\n" +" \n" +" int isLeftChildLeaf = isLeafNode(childIndicies.x);\n" +" int isRightChildLeaf = isLeafNode(childIndicies.y);\n" +" \n" +" int leftChildLeafIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" +" int rightChildLeafIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" +" \n" +" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftChildLeafIndex] : out_internalNodeAabbs[leftChildIndex];\n" +" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightChildLeafIndex] : out_internalNodeAabbs[rightChildIndex];\n" +" \n" +" b3AabbCL internalNodeAabb;\n" +" internalNodeAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" +" internalNodeAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" +" out_internalNodeAabbs[internalNodeIndexGlobal] = internalNodeAabb;\n" +" }\n" +" \n" +" barrier(CLK_GLOBAL_MEM_FENCE);\n" +" }\n" +"}\n" +"//From sap.cl\n" +"#define NEW_PAIR_MARKER -1\n" +"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"//From sap.cl\n" +"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" +" __global int2* internalNodeChildIndices, __global b3AabbCL* internalNodeAabbs,\n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numQueryAabbs)\n" +"{\n" +" int queryRigidIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(queryRigidIndex >= numQueryAabbs) return;\n" +" \n" +" queryRigidIndex = mortonCodesAndAabbIndices[queryRigidIndex].m_value;\n" +" //int queryRigidIndex = get_global_id(0);\n" +" //if(queryRigidIndex >= numQueryAabbs) return;\n" +" \n" +" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" +" \n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" //Starting by placing only the root node index, 0, in the stack causes it to be detected as a leaf node(see isLeafNode() in loop)\n" +" int stackSize = 2;\n" +" stack[0] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].x;\n" +" stack[1] = internalNodeChildIndices[B3_PLBVH_ROOT_NODE_INDEX].y;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( queryRigidIndex != bvhRigidIndex && TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" +" {\n" +" if(isLeaf && rigidAabbs[queryRigidIndex].m_minIndices[3] < rigidAabbs[bvhRigidIndex].m_minIndices[3])\n" +" {\n" +" int4 pair;\n" +" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" +" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" \n" +" }\n" +"}\n" +;