diff --git a/Demos3/GpuDemos/broadphase/PairBench.cpp b/Demos3/GpuDemos/broadphase/PairBench.cpp index 9c445417c..021f7e688 100644 --- a/Demos3/GpuDemos/broadphase/PairBench.cpp +++ b/Demos3/GpuDemos/broadphase/PairBench.cpp @@ -9,6 +9,7 @@ #include "OpenGLWindow/b3gWindowInterface.h" #include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h" #include "Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h" #include "../GpuDemoInternalData.h" #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" @@ -108,6 +109,7 @@ static int curSelectedBroadphase = 0; static BroadphaseEntry allBroadphases[]= { {"Gpu Grid",b3GpuGridBroadphase::CreateFunc}, + {"Parallel Linear BVH",b3GpuParallelLinearBvhBroadphase::CreateFunc}, {"CPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceCpu}, {"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu}, {"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal}, @@ -119,6 +121,7 @@ static BroadphaseEntry allBroadphases[]= struct PairBenchInternalData { b3GpuBroadphaseInterface* m_broadphaseGPU; + b3GpuBroadphaseInterface* m_validationBroadphase; cl_kernel m_moveObjectsKernel; cl_kernel m_sineWaveKernel; @@ -154,6 +157,8 @@ PairBench::PairBench() m_window(0) { m_data = new PairBenchInternalData; + + m_data->m_validationBroadphase = 0; } PairBench::~PairBench() { @@ -505,7 +510,16 @@ void PairBench::initPhysics(const ConstructionInfo& ci) m_data->m_sineWaveKernel = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"sineWaveKernel",&errNum,pairBenchProg); m_data->m_colorPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"colorPairsKernel2",&errNum,pairBenchProg); m_data->m_updateAabbSimple = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"updateAabbSimple",&errNum,pairBenchProg); - + + //Method for validating the overlapping pairs requires that the + //reference broadphase does not maintain internal state aside from AABB data. + //That is, overwriting the AABB state in the broadphase using + // b3GpuBroadphaseInterface::getAllAabbsGPU(), + // b3GpuBroadphaseInterface::getSmallAabbIndicesGPU(), and + // b3GpuBroadphaseInterface::getLargeAabbIndicesGPU() + //and then calling b3GpuBroadphaseInterface::calculateOverlappingPairs() should + //always produce the same result regardless of the current state of the broadphase. + m_data->m_validationBroadphase = b3GpuParallelLinearBvhBroadphase::CreateFunc(m_clData->m_clContext,m_clData->m_clDevice,m_clData->m_clQueue); } if (ci.m_window) @@ -741,6 +755,12 @@ void PairBench::deleteBroadphase() void PairBench::exitPhysics() { + if(m_data->m_validationBroadphase) + { + delete m_data->m_validationBroadphase; + m_data->m_validationBroadphase = 0; + } + #ifdef B3_USE_MIDI if (m_data->m_midiIn) { @@ -768,6 +788,17 @@ void PairBench::renderScene() m_instancingRenderer->renderScene(); } +struct OverlappingPairSortPredicate +{ + inline bool operator() (const b3Int4& a, const b3Int4& b) const + { + if(a.x != b.x) return (a.x < b.x); + if(a.y != b.y) return (a.y < b.y); + if(a.z != b.z) return (a.z < b.z); + return (a.w < b.w); + } +}; + void PairBench::clientMoveAndDisplay() { //color all objects blue @@ -901,7 +932,10 @@ void PairBench::clientMoveAndDisplay() } } - + + int prealloc = 3*1024*1024; + int maxOverlap = b3Min(prealloc,16*numObjects); + unsigned long dt = 0; if (numObjects) { @@ -910,16 +944,104 @@ void PairBench::clientMoveAndDisplay() B3_PROFILE("calculateOverlappingPairs"); int sz = sizeof(b3Int4)*64*numObjects; - int prealloc = 3*1024*1024; - - int maxOverlap = b3Min(prealloc,16*numObjects); m_data->m_broadphaseGPU->calculateOverlappingPairs(maxOverlap); int numPairs = m_data->m_broadphaseGPU->getNumOverlap(); //printf("numPairs = %d\n", numPairs); dt = cl.getTimeMicroseconds()-dt; + } + const bool VALIDATE_BROADPHASE = false; //Check that overlapping pairs of 2 broadphases are the same + if(numObjects && VALIDATE_BROADPHASE) + { + B3_PROFILE("validate broadphases"); + + { + B3_PROFILE("calculateOverlappingPairs m_validationBroadphase"); + //m_data->m_validationBroadphase->getAllAabbsCPU() = m_data->m_broadphaseGPU->getAllAabbsCPU(); + + m_data->m_validationBroadphase->getAllAabbsGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getAllAabbsGPU() ); + m_data->m_validationBroadphase->getSmallAabbIndicesGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getSmallAabbIndicesGPU() ); + m_data->m_validationBroadphase->getLargeAabbIndicesGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getLargeAabbIndicesGPU() ); + + m_data->m_validationBroadphase->calculateOverlappingPairs(maxOverlap); + } + + static b3AlignedObjectArray overlappingPairs; + static b3AlignedObjectArray overlappingPairsReference; + m_data->m_broadphaseGPU->getOverlappingPairsGPU().copyToHost(overlappingPairs); + m_data->m_validationBroadphase->getOverlappingPairsGPU().copyToHost(overlappingPairsReference); + + //Reorder pairs so that (pair.x < pair.y) is always true + { + B3_PROFILE("reorder pairs"); + + for(int i = 0; i < overlappingPairs.size(); ++i) + { + b3Int4 pair = overlappingPairs[i]; + if(pair.x > pair.y) + { + b3Swap(pair.x, pair.y); + b3Swap(pair.z, pair.w); + overlappingPairs[i] = pair; + } + } + for(int i = 0; i < overlappingPairsReference.size(); ++i) + { + b3Int4 pair = overlappingPairsReference[i]; + if(pair.x > pair.y) + { + b3Swap(pair.x, pair.y); + b3Swap(pair.z, pair.w); + overlappingPairsReference[i] = pair; + } + } + } + + // + { + B3_PROFILE("Sort overlapping pairs from most to least significant bit"); + + overlappingPairs.quickSort( OverlappingPairSortPredicate() ); + overlappingPairsReference.quickSort( OverlappingPairSortPredicate() ); + } + + //Compare + { + B3_PROFILE("compare pairs"); + + int numPairs = overlappingPairs.size(); + int numPairsReference = overlappingPairsReference.size(); + + bool success = true; + + if(numPairs == numPairsReference) + { + for(int i = 0; i < numPairsReference; ++i) + { + const b3Int4& pairA = overlappingPairs[i]; + const b3Int4& pairB = overlappingPairsReference[i]; + if( pairA.x != pairB.x + || pairA.y != pairB.y + || pairA.z != pairB.z + || pairA.w != pairB.w ) + { + b3Error("Error: one or more overlappingPairs differs from reference.\n"); + success = false; + break; + } + } + } + else + { + b3Error("Error: numPairs %d != numPairsReference %d \n", numPairs, numPairsReference); + success = false; + } + + printf("Broadphase validation: %d \n", success); + } + } if (m_data->m_gui) { diff --git a/Demos3/GpuDemos/raytrace/RaytracedShadowDemo.cpp b/Demos3/GpuDemos/raytrace/RaytracedShadowDemo.cpp index b114b4511..5c75d12e0 100644 --- a/Demos3/GpuDemos/raytrace/RaytracedShadowDemo.cpp +++ b/Demos3/GpuDemos/raytrace/RaytracedShadowDemo.cpp @@ -187,6 +187,10 @@ void GpuRaytraceScene::renderScene() void GpuRaytraceScene::renderScene2() { + //If using the BVH to accelerate raycasting, the AABBs need to be updated or else they will + //not match the actual rigid body positions after integration. The result is that rigid bodies + //are not drawn or appear clipped, especially if they are moving quickly. + m_data->m_rigidBodyPipeline->setupGpuAabbsFull(); // GpuBoxPlaneScene::renderScene(); // return; @@ -308,7 +312,7 @@ void GpuRaytraceScene::renderScene2() { B3_PROFILE("cast primary rays"); //m_raycaster->castRaysHost(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(),m_data->m_np->getInternalData()); - m_raycaster->castRays(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData()); + m_raycaster->castRays(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData(), m_data->m_bp); } @@ -350,7 +354,7 @@ void GpuRaytraceScene::renderScene2() { B3_PROFILE("cast shadow rays"); //m_raycaster->castRaysHost(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu()); - m_raycaster->castRays(shadowRays, shadowHits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData()); + m_raycaster->castRays(shadowRays, shadowHits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData(), m_data->m_bp); } { diff --git a/build3/stringify.bat b/build3/stringify.bat index 7c65b8149..8876515c7 100644 --- a/build3/stringify.bat +++ b/build3/stringify.bat @@ -13,6 +13,7 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/Broadphas premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify +premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify diff --git a/build3/stringify_linux.sh b/build3/stringify_linux.sh index ed1627c27..751574af8 100755 --- a/build3/stringify_linux.sh +++ b/build3/stringify_linux.sh @@ -14,6 +14,7 @@ rem @echo off ./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify ./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify +./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify diff --git a/docs/b3GpuParallelLinearBvh.pdf b/docs/b3GpuParallelLinearBvh.pdf new file mode 100644 index 000000000..55327bff9 Binary files /dev/null and b/docs/b3GpuParallelLinearBvh.pdf differ diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h index 3598ffb56..09e271e6b 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h @@ -34,6 +34,10 @@ public: virtual b3OpenCLArray& getAllAabbsGPU()=0; virtual b3AlignedObjectArray& getAllAabbsCPU()=0; + + virtual b3OpenCLArray& getOverlappingPairsGPU() = 0; + virtual b3OpenCLArray& getSmallAabbIndicesGPU() = 0; + virtual b3OpenCLArray& getLargeAabbIndicesGPU() = 0; }; diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp index f5308ecf4..deb6d89d9 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.cpp @@ -366,4 +366,18 @@ b3OpenCLArray& b3GpuGridBroadphase::getAllAabbsGPU() b3AlignedObjectArray& b3GpuGridBroadphase::getAllAabbsCPU() { return m_allAabbsCPU1; -} \ No newline at end of file +} + +b3OpenCLArray& b3GpuGridBroadphase::getOverlappingPairsGPU() +{ + return m_gpuPairs; +} +b3OpenCLArray& b3GpuGridBroadphase::getSmallAabbIndicesGPU() +{ + return m_smallAabbsMappingGPU; +} +b3OpenCLArray& b3GpuGridBroadphase::getLargeAabbIndicesGPU() +{ + return m_largeAabbsMappingGPU; +} + diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h index 4dd5c3a3c..9694a362b 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h @@ -78,6 +78,10 @@ public: virtual b3OpenCLArray& getAllAabbsGPU(); virtual b3AlignedObjectArray& getAllAabbsCPU(); + + virtual b3OpenCLArray& getOverlappingPairsGPU(); + virtual b3OpenCLArray& getSmallAabbIndicesGPU(); + virtual b3OpenCLArray& getLargeAabbIndicesGPU(); }; diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp new file mode 100644 index 000000000..641df9eb1 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp @@ -0,0 +1,577 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" + +#include "b3GpuParallelLinearBvh.h" + +b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : + m_queue(queue), + m_radixSorter(context, device, queue), + + m_rootNodeIndex(context, queue), + m_maxDistanceFromRoot(context, queue), + m_temp(context, queue), + + m_internalNodeAabbs(context, queue), + m_internalNodeLeafIndexRanges(context, queue), + m_internalNodeChildNodes(context, queue), + m_internalNodeParentNodes(context, queue), + + m_commonPrefixes(context, queue), + m_commonPrefixLengths(context, queue), + m_distanceFromRoot(context, queue), + + m_leafNodeParentNodes(context, queue), + m_mortonCodesAndAabbIndicies(context, queue), + m_mergedAabb(context, queue), + m_leafNodeAabbs(context, queue), + + m_largeAabbs(context, queue) +{ + m_rootNodeIndex.resize(1); + m_maxDistanceFromRoot.resize(1); + m_temp.resize(1); + + // + const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl"; + + const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h + cl_int error; + char* additionalMacros = 0; + m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH); + b3Assert(m_parallelLinearBvhProgram); + + m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_separateAabbsKernel); + m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findAllNodesMergedAabbKernel); + m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_assignMortonCodesAndAabbIndiciesKernel); + + m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_computeAdjacentPairCommonPrefixKernel); + m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeLeafNodesKernel); + m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeInternalNodesKernel); + m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findDistanceFromRootKernel); + m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel); + + m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_findLeafIndexRangesKernel); + + m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhCalculateOverlappingPairsKernel); + m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhRayTraverseKernel); + m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhLargeAabbAabbTestKernel); + m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros ); + b3Assert(m_plbvhLargeAabbRayTestKernel); +} + +b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh() +{ + clReleaseKernel(m_separateAabbsKernel); + clReleaseKernel(m_findAllNodesMergedAabbKernel); + clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel); + + clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel); + clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel); + clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel); + clReleaseKernel(m_findDistanceFromRootKernel); + clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel); + + clReleaseKernel(m_findLeafIndexRangesKernel); + + clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel); + clReleaseKernel(m_plbvhRayTraverseKernel); + clReleaseKernel(m_plbvhLargeAabbAabbTestKernel); + clReleaseKernel(m_plbvhLargeAabbRayTestKernel); + + clReleaseProgram(m_parallelLinearBvhProgram); +} + +void b3GpuParallelLinearBvh::build(const b3OpenCLArray& worldSpaceAabbs, const b3OpenCLArray& smallAabbIndices, + const b3OpenCLArray& largeAabbIndices) +{ + B3_PROFILE("b3ParallelLinearBvh::build()"); + + int numLargeAabbs = largeAabbIndices.size(); + int numSmallAabbs = smallAabbIndices.size(); + + //Since all AABBs(both large and small) are input as a contiguous array, + //with 2 additional arrays used to indicate the indices of large and small AABBs, + //it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH. + { + B3_PROFILE("Separate large and small AABBs"); + + m_largeAabbs.resize(numLargeAabbs); + m_leafNodeAabbs.resize(numSmallAabbs); + + //Write large AABBs into m_largeAabbs + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( largeAabbIndices.getBufferCL() ), + + b3BufferInfoCL( m_largeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLargeAabbs); + + launcher.launch1D(numLargeAabbs); + } + + //Write small AABBs into m_leafNodeAabbs + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ), + b3BufferInfoCL( smallAabbIndices.getBufferCL() ), + + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numSmallAabbs); + + launcher.launch1D(numSmallAabbs); + } + + clFinish(m_queue); + } + + // + int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs + int numInternalNodes = numLeaves - 1; + + if(numLeaves < 2) + { + //Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(), + //so it does not matter if numLeaves == 0 and rootNodeIndex == -1 + int rootNodeIndex = numLeaves - 1; + m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1); + + //Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm, + //m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index + //instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work. + //( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs ) + if(numLeaves == 1) + { + b3SortData leaf; + leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set + + m_mortonCodesAndAabbIndicies.resize(1); + m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1); + } + + return; + } + + // + { + m_internalNodeAabbs.resize(numInternalNodes); + m_internalNodeLeafIndexRanges.resize(numInternalNodes); + m_internalNodeChildNodes.resize(numInternalNodes); + m_internalNodeParentNodes.resize(numInternalNodes); + + m_commonPrefixes.resize(numInternalNodes); + m_commonPrefixLengths.resize(numInternalNodes); + m_distanceFromRoot.resize(numInternalNodes); + + m_leafNodeParentNodes.resize(numLeaves); + m_mortonCodesAndAabbIndicies.resize(numLeaves); + m_mergedAabb.resize(numLeaves); + } + + //Find the merged AABB of all small AABBs; this is used to define the size of + //each cell in the virtual grid for the next kernel(2^10 cells in each dimension). + { + B3_PROFILE("Find AABB of merged nodes"); + + m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array + + for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2; + numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0] + }; + + b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numAabbsNeedingMerge); + + launcher.launch1D(numAabbsNeedingMerge); + } + + clFinish(m_queue); + } + + //Insert the center of the AABBs into a virtual grid, + //then convert the discrete grid coordinates into a morton code + //For each element in m_mortonCodesAndAabbIndicies, set + // m_key == morton code (value to sort by) + // m_value == small AABB index + { + B3_PROFILE("Assign morton codes"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_mergedAabb.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + // + { + B3_PROFILE("Sort leaves by morton codes"); + + m_radixSorter.execute(m_mortonCodesAndAabbIndicies); + clFinish(m_queue); + } + + // + constructBinaryRadixTree(); + + + //Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices. + //The root node contains leaf node indices in the range [0, numLeafNodes - 1]. + //The child nodes of each node split their parent's index range into 2 contiguous halves. + // + //For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31]. + //The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31]. + // + //This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice + { + B3_PROFILE("m_findLeafIndexRangesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } +} + +void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray& out_overlappingPairs) +{ + int maxPairs = out_overlappingPairs.size(); + b3OpenCLArray& numPairsGpu = m_temp; + + int reset = 0; + numPairsGpu.copyFromHostPointer(&reset, 1); + + // + if( m_leafNodeAabbs.size() > 1 ) + { + B3_PROFILE("PLBVH small-small AABB test"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( numPairsGpu.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + int numLargeAabbRigids = m_largeAabbs.size(); + if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 ) + { + B3_PROFILE("PLBVH large-small AABB test"); + + int numQueryAabbs = m_leafNodeAabbs.size(); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_largeAabbs.getBufferCL() ), + + b3BufferInfoCL( numPairsGpu.getBufferCL() ), + b3BufferInfoCL( out_overlappingPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxPairs); + launcher.setConst(numLargeAabbRigids); + launcher.setConst(numQueryAabbs); + + launcher.launch1D(numQueryAabbs); + clFinish(m_queue); + } + + + // + int numPairs = -1; + numPairsGpu.copyToHostPointer(&numPairs, 1); + if(numPairs > maxPairs) + { + b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs); + numPairs = maxPairs; + numPairsGpu.copyFromHostPointer(&maxPairs, 1); + } + + out_overlappingPairs.resize(numPairs); +} + + +void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray& rays, + b3OpenCLArray& out_numRayRigidPairs, b3OpenCLArray& out_rayRigidPairs) +{ + B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()"); + + int numRays = rays.size(); + int maxRayRigidPairs = out_rayRigidPairs.size(); + + int reset = 0; + out_numRayRigidPairs.copyFromHostPointer(&reset, 1); + + // + if( m_leafNodeAabbs.size() > 0 ) + { + B3_PROFILE("PLBVH ray test small AABB"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + } + + int numLargeAabbRigids = m_largeAabbs.size(); + if(numLargeAabbRigids > 0) + { + B3_PROFILE("PLBVH ray test large AABB"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_largeAabbs.getBufferCL() ), + b3BufferInfoCL( rays.getBufferCL() ), + + b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ), + b3BufferInfoCL( out_rayRigidPairs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLargeAabbRigids); + launcher.setConst(maxRayRigidPairs); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_queue); + } + + // + int numRayRigidPairs = -1; + out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1); + + if(numRayRigidPairs > maxRayRigidPairs) + b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs); + +} + +void b3GpuParallelLinearBvh::constructBinaryRadixTree() +{ + B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()"); + + int numLeaves = m_leafNodeAabbs.size(); + int numInternalNodes = numLeaves - 1; + + //Each internal node is placed in between 2 leaf nodes. + //By using this arrangement and computing the common prefix between + //these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree. + { + B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //For each leaf node, select its parent node by + //comparing the 2 nearest internal nodes and assign child node indices + { + B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numLeaves); + + launcher.launch1D(numLeaves); + clFinish(m_queue); + } + + //For each internal node, perform 2 binary searches among the other internal nodes + //to its left and right to find its potential parent nodes and assign child node indices + { + B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_commonPrefixes.getBufferCL() ), + b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Find the number of nodes seperating each internal node and the root node + //so that the AABBs can be set using the next kernel. + //Also determine the maximum number of nodes separating an internal node and the root node. + { + B3_PROFILE("m_findDistanceFromRootKernel"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ), + b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ), + b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numInternalNodes); + + launcher.launch1D(numInternalNodes); + clFinish(m_queue); + } + + //Starting from the internal nodes nearest to the leaf nodes, recursively move up + //the tree towards the root to set the AABBs of each internal node; each internal node + //checks its children and merges their AABBs + { + B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel"); + + int maxDistanceFromRoot = -1; + { + B3_PROFILE("copy maxDistanceFromRoot to CPU"); + m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1); + clFinish(m_queue); + } + + for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot) + { + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ), + b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ), + b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ), + b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ), + b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ) + }; + + b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(maxDistanceFromRoot); + launcher.setConst(distanceFromRoot); + launcher.setConst(numInternalNodes); + + //It may seem inefficent to launch a thread for each internal node when a + //much smaller number of nodes is actually processed, but this is actually + //faster than determining the exact nodes that are ready to merge their child AABBs. + launcher.launch1D(numInternalNodes); + } + + clFinish(m_queue); + } +} + + \ No newline at end of file diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h new file mode 100644 index 000000000..effe617b7 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h @@ -0,0 +1,125 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H +#define B3_GPU_PARALLEL_LINEAR_BVH_H + +//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h" +#include "Bullet3Common/shared/b3Int2.h" +#include "Bullet3Common/shared/b3Int4.h" +#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h" + +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h" + +#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" + +#define b3Int64 cl_long + +///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame +///@remarks +///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details. +///@par +///Related papers: \n +///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n +///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n +///@par +///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages: +/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid) +/// - [fully parallel] Sort morton codes +/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH) +/// - [somewhat parallel] Set internal node AABBs +///@par +///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages. +///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree. +///Instead of searching for the child nodes of each internal node, we search for the parent node of each node. +///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs. +class b3GpuParallelLinearBvh +{ + cl_command_queue m_queue; + + cl_program m_parallelLinearBvhProgram; + + cl_kernel m_separateAabbsKernel; + cl_kernel m_findAllNodesMergedAabbKernel; + cl_kernel m_assignMortonCodesAndAabbIndiciesKernel; + + //Binary radix tree construction kernels + cl_kernel m_computeAdjacentPairCommonPrefixKernel; + cl_kernel m_buildBinaryRadixTreeLeafNodesKernel; + cl_kernel m_buildBinaryRadixTreeInternalNodesKernel; + cl_kernel m_findDistanceFromRootKernel; + cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel; + + cl_kernel m_findLeafIndexRangesKernel; + + //Traversal kernels + cl_kernel m_plbvhCalculateOverlappingPairsKernel; + cl_kernel m_plbvhRayTraverseKernel; + cl_kernel m_plbvhLargeAabbAabbTestKernel; + cl_kernel m_plbvhLargeAabbRayTestKernel; + + b3RadixSort32CL m_radixSorter; + + //1 element + b3OpenCLArray m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node + b3OpenCLArray m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node + b3OpenCLArray m_temp; //Used to hold the number of pairs in calculateOverlappingPairs() + + //1 element per internal node (number_of_internal_nodes == number_of_leaves - 1) + b3OpenCLArray m_internalNodeAabbs; + b3OpenCLArray m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index + b3OpenCLArray m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node + b3OpenCLArray m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + + //1 element per internal node; for binary radix tree construction + b3OpenCLArray m_commonPrefixes; + b3OpenCLArray m_commonPrefixLengths; + b3OpenCLArray m_distanceFromRoot; //Number of internal nodes between this node and the root + + //1 element per leaf node (leaf nodes only include small AABBs) + b3OpenCLArray m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal + b3OpenCLArray m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs + b3OpenCLArray m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes + b3OpenCLArray m_leafNodeAabbs; //Contains only small AABBs + + //1 element per large AABB, which is not stored in the BVH + b3OpenCLArray m_largeAabbs; + +public: + b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvh(); + + ///Must be called before any other function + void build(const b3OpenCLArray& worldSpaceAabbs, const b3OpenCLArray& smallAabbIndices, + const b3OpenCLArray& largeAabbIndices); + + ///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs. + ///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs. + ///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized. + void calculateOverlappingPairs(b3OpenCLArray& out_overlappingPairs); + + ///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections; + ///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough. + ///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index. + ///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded. + void testRaysAgainstBvhAabbs(const b3OpenCLArray& rays, + b3OpenCLArray& out_numRayRigidPairs, b3OpenCLArray& out_rayRigidPairs); + +private: + void constructBinaryRadixTree(); +}; + +#endif diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp new file mode 100644 index 000000000..78b625631 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.cpp @@ -0,0 +1,80 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#include "b3GpuParallelLinearBvhBroadphase.h" + +b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : + m_plbvh(context, device, queue), + + m_overlappingPairsGpu(context, queue), + + m_aabbsGpu(context, queue), + m_smallAabbsMappingGpu(context, queue), + m_largeAabbsMappingGpu(context, queue) +{ +} + +void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) +{ + int newAabbIndex = m_aabbsCpu.size(); + + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = newAabbIndex; + + m_smallAabbsMappingCpu.push_back(newAabbIndex); + + m_aabbsCpu.push_back(aabb); +} +void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask) +{ + int newAabbIndex = m_aabbsCpu.size(); + + b3SapAabb aabb; + aabb.m_minVec = aabbMin; + aabb.m_maxVec = aabbMax; + + aabb.m_minIndices[3] = userPtr; + aabb.m_signedMaxIndices[3] = newAabbIndex; + + m_largeAabbsMappingCpu.push_back(newAabbIndex); + + m_aabbsCpu.push_back(aabb); +} + +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs) +{ + //Reconstruct BVH + m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu); + + // + m_overlappingPairsGpu.resize(maxPairs); + m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu); +} +void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs) +{ + b3Assert(0); //CPU version not implemented +} + +void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu() +{ + m_aabbsGpu.copyFromHost(m_aabbsCpu); + m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu); + m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu); +} + + + diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h new file mode 100644 index 000000000..284f6c780 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h @@ -0,0 +1,66 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H +#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H + +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" + +#include "b3GpuParallelLinearBvh.h" + +class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface +{ + b3GpuParallelLinearBvh m_plbvh; + + b3OpenCLArray m_overlappingPairsGpu; + + b3OpenCLArray m_aabbsGpu; + b3OpenCLArray m_smallAabbsMappingGpu; + b3OpenCLArray m_largeAabbsMappingGpu; + + b3AlignedObjectArray m_aabbsCpu; + b3AlignedObjectArray m_smallAabbsMappingCpu; + b3AlignedObjectArray m_largeAabbsMappingCpu; + +public: + b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue); + virtual ~b3GpuParallelLinearBvhBroadphase() {} + + virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask); + virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask); + + virtual void calculateOverlappingPairs(int maxPairs); + virtual void calculateOverlappingPairsHost(int maxPairs); + + //call writeAabbsToGpu after done making all changes (createProxy etc) + virtual void writeAabbsToGpu(); + + virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); } + virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); } + + virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); } + virtual b3OpenCLArray& getAllAabbsGPU() { return m_aabbsGpu; } + + virtual b3OpenCLArray& getOverlappingPairsGPU() { return m_overlappingPairsGpu; } + virtual b3OpenCLArray& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; } + virtual b3OpenCLArray& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; } + + virtual b3AlignedObjectArray& getAllAabbsCPU() { return m_aabbsCpu; } + + static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue) + { + return new b3GpuParallelLinearBvhBroadphase(context, device, queue); + } +}; + +#endif diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index 529951af0..280503b34 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -1307,3 +1307,16 @@ cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer() { return m_overlappingPairs.getBufferCL(); } + +b3OpenCLArray& b3GpuSapBroadphase::getOverlappingPairsGPU() +{ + return m_overlappingPairs; +} +b3OpenCLArray& b3GpuSapBroadphase::getSmallAabbIndicesGPU() +{ + return m_smallAabbsMappingGPU; +} +b3OpenCLArray& b3GpuSapBroadphase::getLargeAabbIndicesGPU() +{ + return m_largeAabbsMappingGPU; +} diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h index 2d3d39367..23e4d624d 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h @@ -143,6 +143,9 @@ public: virtual int getNumOverlap(); virtual cl_mem getOverlappingPairBuffer(); + virtual b3OpenCLArray& getOverlappingPairsGPU(); + virtual b3OpenCLArray& getSmallAabbIndicesGPU(); + virtual b3OpenCLArray& getLargeAabbIndicesGPU(); }; #endif //B3_GPU_SAP_BROADPHASE_H \ No newline at end of file diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl new file mode 100644 index 000000000..586bb8abb --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl @@ -0,0 +1,767 @@ +/* +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the use of this software. +Permission is granted to anyone to use this software for any purpose, +including commercial applications, and to alter it and redistribute it freely, +subject to the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. +2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. +3. This notice may not be removed or altered from any source distribution. +*/ +//Initial Author Jackson Lee, 2014 + +typedef float b3Scalar; +typedef float4 b3Vector3; +#define b3Max max +#define b3Min min +#define b3Sqrt sqrt + +typedef struct +{ + unsigned int m_key; + unsigned int m_value; +} SortDataCL; + +typedef struct +{ + union + { + float4 m_min; + float m_minElems[4]; + int m_minIndices[4]; + }; + union + { + float4 m_max; + float m_maxElems[4]; + int m_maxIndices[4]; + }; +} b3AabbCL; + + +unsigned int interleaveBits(unsigned int x) +{ + //........ ........ ......12 3456789A //x + //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits + + //......12 3456789A ......12 3456789A //x ^ (x << 16) + //11111111 ........ ........ 11111111 //0x FF 00 00 FF + //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF; + + //......12 ........ 3456789A 3456789A //x ^ (x << 8) + //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F + //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F; + + //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4) + //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3 + //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3; + + //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2) + //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49 + //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249; + + //........ ........ ......11 11111111 //0x000003FF + x &= 0x000003FF; //Clear all bits above bit 10 + + x = (x ^ (x << 16)) & 0xFF0000FF; + x = (x ^ (x << 8)) & 0x0300F00F; + x = (x ^ (x << 4)) & 0x030C30C3; + x = (x ^ (x << 2)) & 0x09249249; + + return x; +} +unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z) +{ + return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2; +} + +__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate) +{ + int separatedAabbIndex = get_global_id(0); + if(separatedAabbIndex >= numAabbsToSeparate) return; + + int unseparatedAabbIndex = aabbIndices[separatedAabbIndex]; + out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex]; +} + +//Should replace with an optimized parallel reduction +__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge) +{ + //Each time this kernel is added to the command queue, + //the number of AABBs needing to be merged is halved + // + //Example with 159 AABBs: + // numRemainingAabbs == 159 / 2 + 159 % 2 == 80 + // numMergedAabbs == 159 - 80 == 79 + //So, indices [0, 78] are merged with [0 + 80, 78 + 80] + + int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2; + int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs; + + int aabbIndex = get_global_id(0); + if(aabbIndex >= numMergedAabbs) return; + + int otherAabbIndex = aabbIndex + numRemainingAabbs; + + b3AabbCL aabb = out_mergedAabb[aabbIndex]; + b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min); + mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max); + out_mergedAabb[aabbIndex] = mergedAabb; +} + +__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, + __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs) +{ + int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index + if(leafNodeIndex >= numAabbs) return; + + b3AabbCL mergedAabb = mergedAabbOfAllNodes[0]; + b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f; + b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024; + + b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex]; + b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f; + b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter; + + //Quantize into integer coordinates + //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size + b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize; + + int4 discretePosition; + discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) ); + discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) ); + discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) ); + + //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023] + discretePosition = b3Max( -512, b3Min(discretePosition, 511) ); + discretePosition += 512; + + //Interleave bits(assign a morton code, also known as a z-curve) + unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z); + + // + SortDataCL mortonCodeIndexPair; + mortonCodeIndexPair.m_key = mortonCode; + mortonCodeIndexPair.m_value = leafNodeIndex; + + out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair; +} + +#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128 + +//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes. +//If it is set, then the index is for an internal node; otherwise, it is a leaf node. +//In both cases, the bit should be cleared to access the actual node index. +int isLeafNode(int index) { return (index >> 31 == 0); } +int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); } +int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); } + +//From sap.cl +#define NEW_PAIR_MARKER -1 + +bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2) +{ + bool overlap = true; + overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap; + overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap; + overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap; + return overlap; +} +//From sap.cl + +__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, + + __global int* rootNodeIndex, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, + + __global SortDataCL* mortonCodesAndAabbIndices, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numQueryAabbs) +{ + //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since + //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent) + int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(queryBvhNodeIndex >= numQueryAabbs) return; + + int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value; + b3AabbCL queryAabb = rigidAabbs[queryRigidIndex]; + + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + int stackSize = 1; + stack[0] = *rootNodeIndex; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //Optimization - if the BVH is structured as a binary radix tree, then + //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]). + //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself. + { + int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y; + if(highestLeafIndex <= queryBvhNodeIndex) continue; + } + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) ) + { + if(isLeaf) + { + int4 pair; + pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3]; + pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + + } +} + + +//From rayCastKernels.cl +typedef struct +{ + float4 m_from; + float4 m_to; +} b3RayInfo; +//From rayCastKernels.cl + +b3Vector3 b3Vector3_normalize(b3Vector3 v) +{ + b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f}; + return normalize(normal); //OpenCL normalize == vector4 normalize +} +b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; } +b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; } + +int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb) +{ + //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ). + //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane. + // + //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane + //and min.x will be the far plane; otherwise, it is reversed. + // + //In order for there to be a collision, the t_min and t_max of each pair must overlap. + //This can be tested for by selecting the highest t_min and lowest t_max and comparing them. + + int4 isNegative = isless( rayNormalizedDirection, (b3Vector3){0.0f, 0.0f, 0.0f, 0.0f} ); //isless(x,y) returns (x < y) + + //When using vector types, the select() function checks the most signficant bit, + //but isless() sets the least significant bit. + isNegative <<= 31; + + //select(b, a, condition) == condition ? a : b + //When using select() with vector types, (condition[i]) is true if its most significant bit is 1 + b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection; + b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection; + + b3Scalar t_min_final = 0.0f; + b3Scalar t_max_final = rayLength; + + //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. + //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4]) + //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN. + t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) ); + t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) ); + + return (t_min_final <= t_max_final); +} + +__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs, + + __global int* rootNodeIndex, + __global int2* internalNodeChildIndices, + __global b3AabbCL* internalNodeAabbs, + __global int2* internalNodeLeafIndexRanges, + __global SortDataCL* mortonCodesAndAabbIndices, + + __global b3RayInfo* rays, + + __global int* out_numRayRigidPairs, + __global int2* out_rayRigidPairs, + int maxRayRigidPairs, int numRays) +{ + int rayIndex = get_global_id(0); + if(rayIndex >= numRays) return; + + // + b3Vector3 rayFrom = rays[rayIndex].m_from; + b3Vector3 rayTo = rays[rayIndex].m_to; + b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom); + b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) ); + + // + int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE]; + + int stackSize = 1; + stack[0] = *rootNodeIndex; + + while(stackSize) + { + int internalOrLeafNodeIndex = stack[ stackSize - 1 ]; + --stackSize; + + int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false + int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex); + + //bvhRigidIndex is not used if internal node + int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1; + + b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex]; + if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) ) + { + if(isLeaf) + { + int2 rayRigidPair; + rayRigidPair.x = rayIndex; + rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3]; + + int pairIndex = atomic_inc(out_numRayRigidPairs); + if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair; + } + + if(!isLeaf) //Internal node + { + if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE) + { + //Error + } + else + { + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x; + stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y; + } + } + } + } +} + +__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, + __global int* out_numPairs, __global int4* out_overlappingPairs, + int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids) +{ + int smallAabbIndex = get_global_id(0); + if(smallAabbIndex >= numSmallAabbRigids) return; + + b3AabbCL smallAabb = smallAabbs[smallAabbIndex]; + for(int i = 0; i < numLargeAabbRigids; ++i) + { + b3AabbCL largeAabb = largeAabbs[i]; + if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) ) + { + int4 pair; + pair.x = largeAabb.m_minIndices[3]; + pair.y = smallAabb.m_minIndices[3]; + pair.z = NEW_PAIR_MARKER; + pair.w = NEW_PAIR_MARKER; + + int pairIndex = atomic_inc(out_numPairs); + if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair; + } + } +} +__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays, + __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs, + int numLargeAabbRigids, int maxRayRigidPairs, int numRays) +{ + int rayIndex = get_global_id(0); + if(rayIndex >= numRays) return; + + b3Vector3 rayFrom = rays[rayIndex].m_from; + b3Vector3 rayTo = rays[rayIndex].m_to; + b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom); + b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) ); + + for(int i = 0; i < numLargeAabbRigids; ++i) + { + b3AabbCL rigidAabb = largeRigidAabbs[i]; + if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) ) + { + int2 rayRigidPair; + rayRigidPair.x = rayIndex; + rayRigidPair.y = rigidAabb.m_minIndices[3]; + + int pairIndex = atomic_inc(out_numRayRigidPairs); + if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair; + } + } +} + + +//Set so that it is always greater than the actual common prefixes, and never selected as a parent node. +//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve. +//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node. +//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve). +#define B3_PLBVH_INVALID_COMMON_PREFIX 128 + +#define B3_PLBVH_ROOT_NODE_MARKER -1 + +#define b3Int64 long + +int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); } +b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) +{ + //This function only needs to return (i & j) in order for the algorithm to work, + //but it may help with debugging to mask out the lower bits. + + b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j); + + b3Int64 sharedBits = i & j; + b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0 + + return sharedBits & bitmask; +} + +//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths +int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB) +{ + return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) ); +} + +__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices, + __global b3Int64* out_commonPrefixes, + __global int* out_commonPrefixLengths, + int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if (internalNodeIndex >= numInternalNodes) return; + + //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index, + //and the number of internal nodes is always numLeafNodes - 1 + int leftLeafIndex = internalNodeIndex; + int rightLeafIndex = internalNodeIndex + 1; + + int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key; + int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key; + + //Binary radix tree construction algorithm does not work if there are duplicate morton codes. + //Append the index of each leaf node to each morton code so that there are no duplicates. + //The algorithm also requires that the morton codes are sorted in ascending order; this requirement + //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true. + // + //upsample(a, b) == ( ((b3Int64)a) << 32) | b + b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex); + b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex); + + out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); + out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode); +} + + +__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes, + __global int2* out_childNodes, int numLeafNodes) +{ + int leafNodeIndex = get_global_id(0); + if (leafNodeIndex >= numLeafNodes) return; + + int numInternalNodes = numLeafNodes - 1; + + int leftSplitIndex = leafNodeIndex - 1; + int rightSplitIndex = leafNodeIndex; + + int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + //Parent node is the highest adjacent common prefix that is lower than the node's common prefix + //Leaf nodes are considered as having the highest common prefix + int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix); + + //Handle cases for the edge nodes; the first and last node + //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX + if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false; + if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true; + + int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex; + out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex; + + int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + int isLeaf = 1; + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex); +} + +__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths, + __global int2* out_childNodes, + __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex, + int numInternalNodes) +{ + int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + b3Int64 nodePrefix = commonPrefixes[internalNodeIndex]; + int nodePrefixLength = commonPrefixLengths[internalNodeIndex]; + +//#define USE_LINEAR_SEARCH +#ifdef USE_LINEAR_SEARCH + int leftIndex = -1; + int rightIndex = -1; + + //Find nearest element to left with a lower common prefix + for(int i = internalNodeIndex - 1; i >= 0; --i) + { + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + leftIndex = i; + break; + } + } + + //Find nearest element to right with a lower common prefix + for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i) + { + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + rightIndex = i; + break; + } + } + +#else //Use binary search + + //Find nearest element to left with a lower common prefix + int leftIndex = -1; + { + int lower = 0; + int upper = internalNodeIndex - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int right = mid + 1; + if(right < internalNodeIndex) + { + b3Int64 rightPrefix = commonPrefixes[right]; + int rightPrefixLength = commonPrefixLengths[right]; + + int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength); + if(nodeRightSharedPrefixLength < nodePrefixLength) + { + lower = right; + leftIndex = right; + } + else + { + leftIndex = mid; + break; + } + } + else + { + leftIndex = mid; + break; + } + } + else upper = mid - 1; + } + } + + //Find nearest element to right with a lower common prefix + int rightIndex = -1; + { + int lower = internalNodeIndex + 1; + int upper = numInternalNodes - 1; + + while(lower <= upper) + { + int mid = (lower + upper) / 2; + b3Int64 midPrefix = commonPrefixes[mid]; + int midPrefixLength = commonPrefixLengths[mid]; + + int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength); + if(nodeMidSharedPrefixLength < nodePrefixLength) + { + int left = mid - 1; + if(left > internalNodeIndex) + { + b3Int64 leftPrefix = commonPrefixes[left]; + int leftPrefixLength = commonPrefixLengths[left]; + + int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength); + if(nodeLeftSharedPrefixLength < nodePrefixLength) + { + upper = left; + rightIndex = left; + } + else + { + rightIndex = mid; + break; + } + } + else + { + rightIndex = mid; + break; + } + } + else lower = mid + 1; + } + } +#endif + + //Select parent + { + int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX; + + int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength); + + if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false; + else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true; + + int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex; + + int isRootNode = (leftIndex == -1 && rightIndex == -1); + out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER; + + int isLeaf = 0; + if(!isRootNode) + { + int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa + + //out_childNodesAsInt[0] == int2.x == left child + //out_childNodesAsInt[1] == int2.y == right child + __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]); + out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } + else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex); + } +} + +__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes, + __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes) +{ + if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0); + + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + // + int distanceFromRoot = 0; + { + int parentIndex = internalNodeParentNodes[internalNodeIndex]; + while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER) + { + parentIndex = internalNodeParentNodes[parentIndex]; + ++distanceFromRoot; + } + } + out_distanceFromRoot[internalNodeIndex] = distanceFromRoot; + + // + __local int localMaxDistanceFromRoot; + if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0; + barrier(CLK_LOCAL_MEM_FENCE); + + atomic_max(&localMaxDistanceFromRoot, distanceFromRoot); + barrier(CLK_LOCAL_MEM_FENCE); + + if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot); +} + +__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices, + __global int2* childNodes, + __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs, + int maxDistanceFromRoot, int processedDistance, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int distance = distanceFromRoot[internalNodeIndex]; + + if(distance == processedDistance) + { + int leftChildIndex = childNodes[internalNodeIndex].x; + int rightChildIndex = childNodes[internalNodeIndex].y; + + int isLeftChildLeaf = isLeafNode(leftChildIndex); + int isRightChildLeaf = isLeafNode(rightChildIndex); + + leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex); + rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex); + + //leftRigidIndex/rightRigidIndex is not used if internal node + int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1; + int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1; + + b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex]; + b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex]; + + b3AabbCL mergedAabb; + mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min); + mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max); + internalNodeAabbs[internalNodeIndex] = mergedAabb; + } +} + +__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes) +{ + int internalNodeIndex = get_global_id(0); + if(internalNodeIndex >= numInternalNodes) return; + + int numLeafNodes = numInternalNodes + 1; + + int2 childNodes = internalNodeChildNodes[internalNodeIndex]; + + int2 leafIndexRange; //x == min leaf index, y == max leaf index + + //Find lowest leaf index covered by this internal node + { + int lowestIndex = childNodes.x; //childNodes.x == Left child + while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x; + leafIndexRange.x = lowestIndex; + } + + //Find highest leaf index covered by this internal node + { + int highestIndex = childNodes.y; //childNodes.y == Right child + while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y; + leafIndexRange.y = highestIndex; + } + + // + out_leafIndexRanges[internalNodeIndex] = leafIndexRange; +} diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h new file mode 100644 index 000000000..1b72803d3 --- /dev/null +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h @@ -0,0 +1,729 @@ +//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project +static const char* parallelLinearBvhCL= \ +"/*\n" +"This software is provided 'as-is', without any express or implied warranty.\n" +"In no event will the authors be held liable for any damages arising from the use of this software.\n" +"Permission is granted to anyone to use this software for any purpose,\n" +"including commercial applications, and to alter it and redistribute it freely,\n" +"subject to the following restrictions:\n" +"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n" +"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n" +"3. This notice may not be removed or altered from any source distribution.\n" +"*/\n" +"//Initial Author Jackson Lee, 2014\n" +"typedef float b3Scalar;\n" +"typedef float4 b3Vector3;\n" +"#define b3Max max\n" +"#define b3Min min\n" +"#define b3Sqrt sqrt\n" +"typedef struct\n" +"{\n" +" unsigned int m_key;\n" +" unsigned int m_value;\n" +"} SortDataCL;\n" +"typedef struct \n" +"{\n" +" union\n" +" {\n" +" float4 m_min;\n" +" float m_minElems[4];\n" +" int m_minIndices[4];\n" +" };\n" +" union\n" +" {\n" +" float4 m_max;\n" +" float m_maxElems[4];\n" +" int m_maxIndices[4];\n" +" };\n" +"} b3AabbCL;\n" +"unsigned int interleaveBits(unsigned int x)\n" +"{\n" +" //........ ........ ......12 3456789A //x\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n" +" \n" +" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n" +" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n" +" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" \n" +" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n" +" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n" +" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n" +" \n" +" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n" +" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n" +" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n" +" \n" +" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n" +" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n" +" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n" +" \n" +" //........ ........ ......11 11111111 //0x000003FF\n" +" x &= 0x000003FF; //Clear all bits above bit 10\n" +" \n" +" x = (x ^ (x << 16)) & 0xFF0000FF;\n" +" x = (x ^ (x << 8)) & 0x0300F00F;\n" +" x = (x ^ (x << 4)) & 0x030C30C3;\n" +" x = (x ^ (x << 2)) & 0x09249249;\n" +" \n" +" return x;\n" +"}\n" +"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n" +"{\n" +" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n" +"}\n" +"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n" +"{\n" +" int separatedAabbIndex = get_global_id(0);\n" +" if(separatedAabbIndex >= numAabbsToSeparate) return;\n" +" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n" +" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n" +"}\n" +"//Should replace with an optimized parallel reduction\n" +"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n" +"{\n" +" //Each time this kernel is added to the command queue, \n" +" //the number of AABBs needing to be merged is halved\n" +" //\n" +" //Example with 159 AABBs:\n" +" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n" +" // numMergedAabbs == 159 - 80 == 79\n" +" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n" +" \n" +" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n" +" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n" +" \n" +" int aabbIndex = get_global_id(0);\n" +" if(aabbIndex >= numMergedAabbs) return;\n" +" \n" +" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n" +" \n" +" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n" +" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n" +" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n" +" out_mergedAabb[aabbIndex] = mergedAabb;\n" +"}\n" +"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n" +" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n" +"{\n" +" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n" +" if(leafNodeIndex >= numAabbs) return;\n" +" \n" +" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n" +" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n" +" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n" +" \n" +" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n" +" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n" +" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n" +" \n" +" //Quantize into integer coordinates\n" +" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n" +" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n" +" \n" +" int4 discretePosition;\n" +" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n" +" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n" +" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n" +" \n" +" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n" +" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n" +" discretePosition += 512;\n" +" \n" +" //Interleave bits(assign a morton code, also known as a z-curve)\n" +" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n" +" \n" +" //\n" +" SortDataCL mortonCodeIndexPair;\n" +" mortonCodeIndexPair.m_key = mortonCode;\n" +" mortonCodeIndexPair.m_value = leafNodeIndex;\n" +" \n" +" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n" +"}\n" +"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n" +"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n" +"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n" +"//In both cases, the bit should be cleared to access the actual node index.\n" +"int isLeafNode(int index) { return (index >> 31 == 0); }\n" +"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n" +"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n" +"//From sap.cl\n" +"#define NEW_PAIR_MARKER -1\n" +"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n" +"{\n" +" bool overlap = true;\n" +" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n" +" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n" +" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" +" return overlap;\n" +"}\n" +"//From sap.cl\n" +"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n" +" __global int* rootNodeIndex, \n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" +" \n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numQueryAabbs)\n" +"{\n" +" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n" +" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n" +" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(queryBvhNodeIndex >= numQueryAabbs) return;\n" +" \n" +" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n" +" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n" +" \n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" int stackSize = 1;\n" +" stack[0] = *rootNodeIndex;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //Optimization - if the BVH is structured as a binary radix tree, then\n" +" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n" +" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n" +" {\n" +" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n" +" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n" +" }\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n" +" {\n" +" if(isLeaf)\n" +" {\n" +" int4 pair;\n" +" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n" +" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" \n" +" }\n" +"}\n" +"//From rayCastKernels.cl\n" +"typedef struct\n" +"{\n" +" float4 m_from;\n" +" float4 m_to;\n" +"} b3RayInfo;\n" +"//From rayCastKernels.cl\n" +"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n" +"{\n" +" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n" +" return normalize(normal); //OpenCL normalize == vector4 normalize\n" +"}\n" +"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n" +"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n" +"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n" +"{\n" +" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n" +" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n" +" //\n" +" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n" +" //and min.x will be the far plane; otherwise, it is reversed.\n" +" //\n" +" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n" +" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n" +" \n" +" int4 isNegative = isless( rayNormalizedDirection, (b3Vector3){0.0f, 0.0f, 0.0f, 0.0f} ); //isless(x,y) returns (x < y)\n" +" \n" +" //When using vector types, the select() function checks the most signficant bit, \n" +" //but isless() sets the least significant bit.\n" +" isNegative <<= 31;\n" +" //select(b, a, condition) == condition ? a : b\n" +" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n" +" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n" +" \n" +" b3Scalar t_min_final = 0.0f;\n" +" b3Scalar t_max_final = rayLength;\n" +" \n" +" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n" +" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n" +" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n" +" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n" +" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n" +" \n" +" return (t_min_final <= t_max_final);\n" +"}\n" +"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n" +" __global int* rootNodeIndex, \n" +" __global int2* internalNodeChildIndices, \n" +" __global b3AabbCL* internalNodeAabbs,\n" +" __global int2* internalNodeLeafIndexRanges,\n" +" __global SortDataCL* mortonCodesAndAabbIndices,\n" +" \n" +" __global b3RayInfo* rays,\n" +" \n" +" __global int* out_numRayRigidPairs, \n" +" __global int2* out_rayRigidPairs,\n" +" int maxRayRigidPairs, int numRays)\n" +"{\n" +" int rayIndex = get_global_id(0);\n" +" if(rayIndex >= numRays) return;\n" +" \n" +" //\n" +" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +" b3Vector3 rayTo = rays[rayIndex].m_to;\n" +" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +" \n" +" //\n" +" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n" +" \n" +" int stackSize = 1;\n" +" stack[0] = *rootNodeIndex;\n" +" \n" +" while(stackSize)\n" +" {\n" +" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n" +" --stackSize;\n" +" \n" +" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n" +" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n" +" \n" +" //bvhRigidIndex is not used if internal node\n" +" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n" +" \n" +" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n" +" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n" +" {\n" +" if(isLeaf)\n" +" {\n" +" int2 rayRigidPair;\n" +" rayRigidPair.x = rayIndex;\n" +" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n" +" \n" +" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +" }\n" +" \n" +" if(!isLeaf) //Internal node\n" +" {\n" +" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n" +" {\n" +" //Error\n" +" }\n" +" else\n" +" {\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n" +" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n" +" }\n" +" }\n" +" }\n" +" }\n" +"}\n" +"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n" +" __global int* out_numPairs, __global int4* out_overlappingPairs, \n" +" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n" +"{\n" +" int smallAabbIndex = get_global_id(0);\n" +" if(smallAabbIndex >= numSmallAabbRigids) return;\n" +" \n" +" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n" +" for(int i = 0; i < numLargeAabbRigids; ++i)\n" +" {\n" +" b3AabbCL largeAabb = largeAabbs[i];\n" +" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n" +" {\n" +" int4 pair;\n" +" pair.x = largeAabb.m_minIndices[3];\n" +" pair.y = smallAabb.m_minIndices[3];\n" +" pair.z = NEW_PAIR_MARKER;\n" +" pair.w = NEW_PAIR_MARKER;\n" +" \n" +" int pairIndex = atomic_inc(out_numPairs);\n" +" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n" +" }\n" +" }\n" +"}\n" +"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n" +" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n" +" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n" +"{\n" +" int rayIndex = get_global_id(0);\n" +" if(rayIndex >= numRays) return;\n" +" \n" +" b3Vector3 rayFrom = rays[rayIndex].m_from;\n" +" b3Vector3 rayTo = rays[rayIndex].m_to;\n" +" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n" +" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n" +" \n" +" for(int i = 0; i < numLargeAabbRigids; ++i)\n" +" {\n" +" b3AabbCL rigidAabb = largeRigidAabbs[i];\n" +" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n" +" {\n" +" int2 rayRigidPair;\n" +" rayRigidPair.x = rayIndex;\n" +" rayRigidPair.y = rigidAabb.m_minIndices[3];\n" +" \n" +" int pairIndex = atomic_inc(out_numRayRigidPairs);\n" +" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n" +" }\n" +" }\n" +"}\n" +"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n" +"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n" +"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n" +"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n" +"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n" +"#define B3_PLBVH_ROOT_NODE_MARKER -1\n" +"#define b3Int64 long\n" +"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n" +"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n" +"{\n" +" //This function only needs to return (i & j) in order for the algorithm to work,\n" +" //but it may help with debugging to mask out the lower bits.\n" +" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n" +" b3Int64 sharedBits = i & j;\n" +" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n" +" \n" +" return sharedBits & bitmask;\n" +"}\n" +"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n" +"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n" +"{\n" +" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n" +"}\n" +"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global b3Int64* out_commonPrefixes,\n" +" __global int* out_commonPrefixLengths,\n" +" int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if (internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n" +" //and the number of internal nodes is always numLeafNodes - 1\n" +" int leftLeafIndex = internalNodeIndex;\n" +" int rightLeafIndex = internalNodeIndex + 1;\n" +" \n" +" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n" +" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n" +" \n" +" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n" +" //Append the index of each leaf node to each morton code so that there are no duplicates.\n" +" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n" +" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n" +" //\n" +" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n" +" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n" +" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n" +" \n" +" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n" +"}\n" +"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n" +" __global int2* out_childNodes, int numLeafNodes)\n" +"{\n" +" int leafNodeIndex = get_global_id(0);\n" +" if (leafNodeIndex >= numLeafNodes) return;\n" +" \n" +" int numInternalNodes = numLeafNodes - 1;\n" +" \n" +" int leftSplitIndex = leafNodeIndex - 1;\n" +" int rightSplitIndex = leafNodeIndex;\n" +" \n" +" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n" +" //Leaf nodes are considered as having the highest common prefix\n" +" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n" +" \n" +" //Handle cases for the edge nodes; the first and last node\n" +" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n" +" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n" +" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n" +" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n" +" \n" +" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" int isLeaf = 1;\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n" +"}\n" +"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n" +" __global int2* out_childNodes,\n" +" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n" +" int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n" +" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n" +" \n" +"//#define USE_LINEAR_SEARCH\n" +"#ifdef USE_LINEAR_SEARCH\n" +" int leftIndex = -1;\n" +" int rightIndex = -1;\n" +" \n" +" //Find nearest element to left with a lower common prefix\n" +" for(int i = internalNodeIndex - 1; i >= 0; --i)\n" +" {\n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" leftIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n" +" {\n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength)\n" +" {\n" +" rightIndex = i;\n" +" break;\n" +" }\n" +" }\n" +" \n" +"#else //Use binary search\n" +" //Find nearest element to left with a lower common prefix\n" +" int leftIndex = -1;\n" +" {\n" +" int lower = 0;\n" +" int upper = internalNodeIndex - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int right = mid + 1;\n" +" if(right < internalNodeIndex)\n" +" {\n" +" b3Int64 rightPrefix = commonPrefixes[right];\n" +" int rightPrefixLength = commonPrefixLengths[right];\n" +" \n" +" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n" +" if(nodeRightSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" lower = right;\n" +" leftIndex = right;\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" leftIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else upper = mid - 1;\n" +" }\n" +" }\n" +" \n" +" //Find nearest element to right with a lower common prefix\n" +" int rightIndex = -1;\n" +" {\n" +" int lower = internalNodeIndex + 1;\n" +" int upper = numInternalNodes - 1;\n" +" \n" +" while(lower <= upper)\n" +" {\n" +" int mid = (lower + upper) / 2;\n" +" b3Int64 midPrefix = commonPrefixes[mid];\n" +" int midPrefixLength = commonPrefixLengths[mid];\n" +" \n" +" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n" +" if(nodeMidSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" int left = mid - 1;\n" +" if(left > internalNodeIndex)\n" +" {\n" +" b3Int64 leftPrefix = commonPrefixes[left];\n" +" int leftPrefixLength = commonPrefixLengths[left];\n" +" \n" +" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n" +" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n" +" {\n" +" upper = left;\n" +" rightIndex = left;\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else \n" +" {\n" +" rightIndex = mid;\n" +" break;\n" +" }\n" +" }\n" +" else lower = mid + 1;\n" +" }\n" +" }\n" +"#endif\n" +" \n" +" //Select parent\n" +" {\n" +" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n" +" \n" +" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n" +" \n" +" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n" +" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n" +" \n" +" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n" +" \n" +" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n" +" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n" +" \n" +" int isLeaf = 0;\n" +" if(!isRootNode)\n" +" {\n" +" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n" +" \n" +" //out_childNodesAsInt[0] == int2.x == left child\n" +" //out_childNodesAsInt[1] == int2.y == right child\n" +" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n" +" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n" +" }\n" +"}\n" +"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n" +" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n" +"{\n" +" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" //\n" +" int distanceFromRoot = 0;\n" +" {\n" +" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n" +" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n" +" {\n" +" parentIndex = internalNodeParentNodes[parentIndex];\n" +" ++distanceFromRoot;\n" +" }\n" +" }\n" +" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n" +" \n" +" //\n" +" __local int localMaxDistanceFromRoot;\n" +" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n" +" barrier(CLK_LOCAL_MEM_FENCE);\n" +" \n" +" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n" +"}\n" +"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n" +" __global int2* childNodes,\n" +" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n" +" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int distance = distanceFromRoot[internalNodeIndex];\n" +" \n" +" if(distance == processedDistance)\n" +" {\n" +" int leftChildIndex = childNodes[internalNodeIndex].x;\n" +" int rightChildIndex = childNodes[internalNodeIndex].y;\n" +" \n" +" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n" +" int isRightChildLeaf = isLeafNode(rightChildIndex);\n" +" \n" +" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n" +" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n" +" \n" +" //leftRigidIndex/rightRigidIndex is not used if internal node\n" +" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n" +" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n" +" \n" +" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n" +" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n" +" \n" +" b3AabbCL mergedAabb;\n" +" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n" +" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n" +" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n" +" }\n" +"}\n" +"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n" +"{\n" +" int internalNodeIndex = get_global_id(0);\n" +" if(internalNodeIndex >= numInternalNodes) return;\n" +" \n" +" int numLeafNodes = numInternalNodes + 1;\n" +" \n" +" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n" +" \n" +" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n" +" \n" +" //Find lowest leaf index covered by this internal node\n" +" {\n" +" int lowestIndex = childNodes.x; //childNodes.x == Left child\n" +" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n" +" leafIndexRange.x = lowestIndex;\n" +" }\n" +" \n" +" //Find highest leaf index covered by this internal node\n" +" {\n" +" int highestIndex = childNodes.y; //childNodes.y == Right child\n" +" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n" +" leafIndexRange.y = highestIndex;\n" +" }\n" +" \n" +" //\n" +" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n" +"}\n" +; diff --git a/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp b/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp index cb57fbd5b..4ef38bd1d 100644 --- a/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp +++ b/src/Bullet3OpenCL/Raycast/b3GpuRaycast.cpp @@ -8,6 +8,11 @@ #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h" #include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h" #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" +#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h" +#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h" + #include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h" @@ -20,7 +25,24 @@ struct b3GpuRaycastInternalData cl_context m_context; cl_device_id m_device; cl_command_queue m_q; - cl_kernel m_raytraceKernel; + cl_kernel m_raytraceKernel; + cl_kernel m_raytracePairsKernel; + cl_kernel m_findRayRigidPairIndexRanges; + + b3GpuParallelLinearBvh* m_plbvh; + b3RadixSort32CL* m_radixSorter; + b3FillCL* m_fill; + + //1 element per ray + b3OpenCLArray* m_gpuRays; + b3OpenCLArray* m_gpuHitResults; + b3OpenCLArray* m_firstRayRigidPairIndexPerRay; + b3OpenCLArray* m_numRayRigidPairsPerRay; + + //1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB + b3OpenCLArray* m_gpuNumRayRigidPairs; + b3OpenCLArray* m_gpuRayRigidPairs; //x == ray index, y == rigid index + int m_test; }; @@ -31,7 +53,19 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue m_data->m_device = device; m_data->m_q = q; m_data->m_raytraceKernel = 0; + m_data->m_raytracePairsKernel = 0; + m_data->m_findRayRigidPairIndexRanges = 0; + m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q); + m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q); + m_data->m_fill = new b3FillCL(ctx, device, q); + + m_data->m_gpuRays = new b3OpenCLArray(ctx, q); + m_data->m_gpuHitResults = new b3OpenCLArray(ctx, q); + m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray(ctx, q); + m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray(ctx, q); + m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray(ctx, q); + m_data->m_gpuRayRigidPairs = new b3OpenCLArray(ctx, q); { cl_int errNum=0; @@ -39,6 +73,10 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue b3Assert(errNum==CL_SUCCESS); m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog); b3Assert(errNum==CL_SUCCESS); + m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); + m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog); + b3Assert(errNum==CL_SUCCESS); clReleaseProgram(prog); } @@ -48,6 +86,20 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue b3GpuRaycast::~b3GpuRaycast() { clReleaseKernel(m_data->m_raytraceKernel); + clReleaseKernel(m_data->m_raytracePairsKernel); + clReleaseKernel(m_data->m_findRayRigidPairIndexRanges); + + delete m_data->m_plbvh; + delete m_data->m_radixSorter; + delete m_data->m_fill; + + delete m_data->m_gpuRays; + delete m_data->m_gpuHitResults; + delete m_data->m_firstRayRigidPairIndexPerRay; + delete m_data->m_numRayRigidPairsPerRay; + delete m_data->m_gpuNumRayRigidPairs; + delete m_data->m_gpuRayRigidPairs; + delete m_data; } @@ -206,27 +258,32 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray& rays, b3A } ///todo: add some acceleration structure (AABBs, tree etc) void b3GpuRaycast::castRays(const b3AlignedObjectArray& rays, b3AlignedObjectArray& hitResults, - int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData) + int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, + const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase) { - //castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData); B3_PROFILE("castRaysGPU"); - - b3OpenCLArray gpuRays(m_data->m_context,m_data->m_q); - b3OpenCLArray gpuHitResults(m_data->m_context,m_data->m_q); - + { B3_PROFILE("raycast copyFromHost"); - gpuRays.copyFromHost(rays); - - - gpuHitResults.resize(hitResults.size()); - gpuHitResults.copyFromHost(hitResults); + m_data->m_gpuRays->copyFromHost(rays); + m_data->m_gpuHitResults->copyFromHost(hitResults); + } - - + + int numRays = hitResults.size(); + { + m_data->m_firstRayRigidPairIndexPerRay->resize(numRays); + m_data->m_numRayRigidPairsPerRay->resize(numRays); + + m_data->m_gpuNumRayRigidPairs->resize(1); + m_data->m_gpuRayRigidPairs->resize(numRays * 16); + } + //run kernel + const bool USE_BRUTE_FORCE_RAYCAST = false; + if(USE_BRUTE_FORCE_RAYCAST) { B3_PROFILE("raycast launch1D"); @@ -234,8 +291,8 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray& rays, b3Align int numRays = rays.size(); launcher.setConst(numRays); - launcher.setBuffer(gpuRays.getBufferCL()); - launcher.setBuffer(gpuHitResults.getBufferCL()); + launcher.setBuffer(m_data->m_gpuRays->getBufferCL()); + launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL()); launcher.setConst(numBodies); launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL()); @@ -246,11 +303,89 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray& rays, b3Align launcher.launch1D(numRays); clFinish(m_data->m_q); } + else + { + m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() ); + + m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs); + + int numRayRigidPairs = -1; + m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1); + if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() ) + { + numRayRigidPairs = m_data->m_gpuRayRigidPairs->size(); + m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1); + } + + m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct + + //Sort ray-rigid pairs by ray index + { + B3_PROFILE("sort ray-rigid pairs"); + m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray* >(m_data->m_gpuRayRigidPairs) ); + } + + //detect start,count of each ray pair + { + B3_PROFILE("detect ray-rigid pair index ranges"); + + { + B3_PROFILE("reset ray-rigid pair index ranges"); + + m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index + m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays); + clFinish(m_data->m_q); + } + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ), + + b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ), + b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ) + }; + + b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numRayRigidPairs); + + launcher.launch1D(numRayRigidPairs); + clFinish(m_data->m_q); + } + + { + B3_PROFILE("ray-rigid intersection"); + + b3BufferInfoCL bufferInfo[] = + { + b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ), + b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ), + b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ), + b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ), + + b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ), + b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ), + b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ), + b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ), + + b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ) + }; + + b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel"); + launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) ); + launcher.setConst(numRays); + + launcher.launch1D(numRays); + clFinish(m_data->m_q); + } + } + + //copy results { B3_PROFILE("raycast copyToHost"); - gpuHitResults.copyToHost(hitResults); + m_data->m_gpuHitResults->copyToHost(hitResults); } } \ No newline at end of file diff --git a/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h b/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h index 8b0cbd0be..3a5cf44b7 100644 --- a/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h +++ b/src/Bullet3OpenCL/Raycast/b3GpuRaycast.h @@ -23,8 +23,7 @@ public: void castRays(const b3AlignedObjectArray& rays, b3AlignedObjectArray& hitResults, int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, - const struct b3GpuNarrowPhaseInternalData* narrowphaseData - ); + const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase); diff --git a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl index acf0bdd32..e72d96876 100644 --- a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl +++ b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl @@ -337,3 +337,103 @@ __kernel void rayCastKernel( } } + + +__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, + __global int* out_firstRayRigidPairIndexPerRay, + __global int* out_numRayRigidPairsPerRay, + int numRayRigidPairs) +{ + int rayRigidPairIndex = get_global_id(0); + if (rayRigidPairIndex >= numRayRigidPairs) return; + + int rayIndex = rayRigidPairs[rayRigidPairIndex].x; + + atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex); + atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]); +} + +__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, + __global b3RayHit* hitResults, + __global int* firstRayRigidPairIndexPerRay, + __global int* numRayRigidPairsPerRay, + + __global Body* bodies, + __global Collidable* collidables, + __global const b3GpuFace* faces, + __global const ConvexPolyhedronCL* convexShapes, + + __global int2* rayRigidPairs, + int numRays) +{ + int i = get_global_id(0); + if (i >= numRays) return; + + float4 rayFrom = rays[i].m_from; + float4 rayTo = rays[i].m_to; + + hitResults[i].m_hitFraction = 1.f; + + float hitFraction = 1.f; + float4 hitPoint; + float4 hitNormal; + int hitBodyIndex = -1; + + // + for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair) + { + int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i]; + int b = rayRigidPairs[rayRigidPairIndex].y; + + if (hitResults[i].m_hitResult2 == b) continue; + + Body body = bodies[b]; + Collidable rigidCollidable = collidables[body.m_collidableIdx]; + + float4 pos = body.m_pos; + float4 orn = body.m_quat; + + if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL) + { + float4 invPos = (float4)(0,0,0,0); + float4 invOrn = (float4)(0,0,0,0); + float4 rayFromLocal = (float4)(0,0,0,0); + float4 rayToLocal = (float4)(0,0,0,0); + invOrn = qtInvert(orn); + invPos = qtRotate(invOrn, -pos); + rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos; + rayToLocal = qtRotate( invOrn, rayTo) + invPos; + rayFromLocal.w = 0.f; + rayToLocal.w = 0.f; + int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces; + int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset; + + if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal)) + { + hitBodyIndex = b; + hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction); + } + } + + if (rigidCollidable.m_shapeType == SHAPE_SPHERE) + { + float radius = rigidCollidable.m_radius; + + if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction)) + { + hitBodyIndex = b; + hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction); + hitNormal = (float4) (hitPoint - bodies[b].m_pos); + } + } + } + + if (hitBodyIndex >= 0) + { + hitResults[i].m_hitFraction = hitFraction; + hitResults[i].m_hitPoint = hitPoint; + hitResults[i].m_hitNormal = normalize(hitNormal); + hitResults[i].m_hitResult0 = hitBodyIndex; + } + +} diff --git a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h index 159da7c17..6257909a4 100644 --- a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h +++ b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h @@ -281,4 +281,101 @@ static const char* rayCastKernelCL= \ " hitResults[i].m_hitResult0 = hitBodyIndex;\n" " }\n" "}\n" +"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n" +" __global int* out_firstRayRigidPairIndexPerRay,\n" +" __global int* out_numRayRigidPairsPerRay,\n" +" int numRayRigidPairs)\n" +"{\n" +" int rayRigidPairIndex = get_global_id(0);\n" +" if (rayRigidPairIndex >= numRayRigidPairs) return;\n" +" \n" +" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n" +" \n" +" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n" +" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n" +"}\n" +"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n" +" __global b3RayHit* hitResults, \n" +" __global int* firstRayRigidPairIndexPerRay,\n" +" __global int* numRayRigidPairsPerRay,\n" +" \n" +" __global Body* bodies,\n" +" __global Collidable* collidables,\n" +" __global const b3GpuFace* faces,\n" +" __global const ConvexPolyhedronCL* convexShapes,\n" +" \n" +" __global int2* rayRigidPairs,\n" +" int numRays)\n" +"{\n" +" int i = get_global_id(0);\n" +" if (i >= numRays) return;\n" +" \n" +" float4 rayFrom = rays[i].m_from;\n" +" float4 rayTo = rays[i].m_to;\n" +" \n" +" hitResults[i].m_hitFraction = 1.f;\n" +" \n" +" float hitFraction = 1.f;\n" +" float4 hitPoint;\n" +" float4 hitNormal;\n" +" int hitBodyIndex = -1;\n" +" \n" +" //\n" +" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n" +" {\n" +" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n" +" int b = rayRigidPairs[rayRigidPairIndex].y;\n" +" \n" +" if (hitResults[i].m_hitResult2 == b) continue;\n" +" \n" +" Body body = bodies[b];\n" +" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n" +" \n" +" float4 pos = body.m_pos;\n" +" float4 orn = body.m_quat;\n" +" \n" +" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n" +" {\n" +" float4 invPos = (float4)(0,0,0,0);\n" +" float4 invOrn = (float4)(0,0,0,0);\n" +" float4 rayFromLocal = (float4)(0,0,0,0);\n" +" float4 rayToLocal = (float4)(0,0,0,0);\n" +" invOrn = qtInvert(orn);\n" +" invPos = qtRotate(invOrn, -pos);\n" +" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n" +" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n" +" rayFromLocal.w = 0.f;\n" +" rayToLocal.w = 0.f;\n" +" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n" +" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n" +" \n" +" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n" +" {\n" +" hitBodyIndex = b;\n" +" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" +" }\n" +" }\n" +" \n" +" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n" +" {\n" +" float radius = rigidCollidable.m_radius;\n" +" \n" +" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n" +" {\n" +" hitBodyIndex = b;\n" +" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n" +" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n" +" }\n" +" }\n" +" }\n" +" \n" +" if (hitBodyIndex >= 0)\n" +" {\n" +" hitResults[i].m_hitFraction = hitFraction;\n" +" hitResults[i].m_hitPoint = hitPoint;\n" +" hitResults[i].m_hitNormal = normalize(hitNormal);\n" +" hitResults[i].m_hitResult0 = hitBodyIndex;\n" +" }\n" +" \n" +"}\n" ; diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp index e1665f406..783e44306 100644 --- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp +++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp @@ -703,6 +703,6 @@ void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray& ray { this->m_data->m_raycaster->castRays(rays,hitResults, getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(), - m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getInternalData() - ); + m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), + m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap); }