Merge branch 'master' of https://github.com/bulletphysics/bullet3
This commit is contained in:
@@ -9,6 +9,7 @@
|
||||
#include "OpenGLWindow/b3gWindowInterface.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h"
|
||||
|
||||
#include "../GpuDemoInternalData.h"
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
@@ -108,6 +109,7 @@ static int curSelectedBroadphase = 0;
|
||||
static BroadphaseEntry allBroadphases[]=
|
||||
{
|
||||
{"Gpu Grid",b3GpuGridBroadphase::CreateFunc},
|
||||
{"Parallel Linear BVH",b3GpuParallelLinearBvhBroadphase::CreateFunc},
|
||||
{"CPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceCpu},
|
||||
{"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu},
|
||||
{"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal},
|
||||
@@ -119,6 +121,7 @@ static BroadphaseEntry allBroadphases[]=
|
||||
struct PairBenchInternalData
|
||||
{
|
||||
b3GpuBroadphaseInterface* m_broadphaseGPU;
|
||||
b3GpuBroadphaseInterface* m_validationBroadphase;
|
||||
|
||||
cl_kernel m_moveObjectsKernel;
|
||||
cl_kernel m_sineWaveKernel;
|
||||
@@ -154,6 +157,8 @@ PairBench::PairBench()
|
||||
m_window(0)
|
||||
{
|
||||
m_data = new PairBenchInternalData;
|
||||
|
||||
m_data->m_validationBroadphase = 0;
|
||||
}
|
||||
PairBench::~PairBench()
|
||||
{
|
||||
@@ -505,7 +510,16 @@ void PairBench::initPhysics(const ConstructionInfo& ci)
|
||||
m_data->m_sineWaveKernel = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"sineWaveKernel",&errNum,pairBenchProg);
|
||||
m_data->m_colorPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"colorPairsKernel2",&errNum,pairBenchProg);
|
||||
m_data->m_updateAabbSimple = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"updateAabbSimple",&errNum,pairBenchProg);
|
||||
|
||||
|
||||
//Method for validating the overlapping pairs requires that the
|
||||
//reference broadphase does not maintain internal state aside from AABB data.
|
||||
//That is, overwriting the AABB state in the broadphase using
|
||||
// b3GpuBroadphaseInterface::getAllAabbsGPU(),
|
||||
// b3GpuBroadphaseInterface::getSmallAabbIndicesGPU(), and
|
||||
// b3GpuBroadphaseInterface::getLargeAabbIndicesGPU()
|
||||
//and then calling b3GpuBroadphaseInterface::calculateOverlappingPairs() should
|
||||
//always produce the same result regardless of the current state of the broadphase.
|
||||
m_data->m_validationBroadphase = b3GpuParallelLinearBvhBroadphase::CreateFunc(m_clData->m_clContext,m_clData->m_clDevice,m_clData->m_clQueue);
|
||||
}
|
||||
|
||||
if (ci.m_window)
|
||||
@@ -741,6 +755,12 @@ void PairBench::deleteBroadphase()
|
||||
|
||||
void PairBench::exitPhysics()
|
||||
{
|
||||
if(m_data->m_validationBroadphase)
|
||||
{
|
||||
delete m_data->m_validationBroadphase;
|
||||
m_data->m_validationBroadphase = 0;
|
||||
}
|
||||
|
||||
#ifdef B3_USE_MIDI
|
||||
if (m_data->m_midiIn)
|
||||
{
|
||||
@@ -768,6 +788,17 @@ void PairBench::renderScene()
|
||||
m_instancingRenderer->renderScene();
|
||||
}
|
||||
|
||||
struct OverlappingPairSortPredicate
|
||||
{
|
||||
inline bool operator() (const b3Int4& a, const b3Int4& b) const
|
||||
{
|
||||
if(a.x != b.x) return (a.x < b.x);
|
||||
if(a.y != b.y) return (a.y < b.y);
|
||||
if(a.z != b.z) return (a.z < b.z);
|
||||
return (a.w < b.w);
|
||||
}
|
||||
};
|
||||
|
||||
void PairBench::clientMoveAndDisplay()
|
||||
{
|
||||
//color all objects blue
|
||||
@@ -901,7 +932,10 @@ void PairBench::clientMoveAndDisplay()
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
int prealloc = 3*1024*1024;
|
||||
int maxOverlap = b3Min(prealloc,16*numObjects);
|
||||
|
||||
unsigned long dt = 0;
|
||||
if (numObjects)
|
||||
{
|
||||
@@ -910,16 +944,104 @@ void PairBench::clientMoveAndDisplay()
|
||||
B3_PROFILE("calculateOverlappingPairs");
|
||||
int sz = sizeof(b3Int4)*64*numObjects;
|
||||
|
||||
int prealloc = 3*1024*1024;
|
||||
|
||||
int maxOverlap = b3Min(prealloc,16*numObjects);
|
||||
|
||||
m_data->m_broadphaseGPU->calculateOverlappingPairs(maxOverlap);
|
||||
int numPairs = m_data->m_broadphaseGPU->getNumOverlap();
|
||||
//printf("numPairs = %d\n", numPairs);
|
||||
dt = cl.getTimeMicroseconds()-dt;
|
||||
|
||||
}
|
||||
|
||||
const bool VALIDATE_BROADPHASE = false; //Check that overlapping pairs of 2 broadphases are the same
|
||||
if(numObjects && VALIDATE_BROADPHASE)
|
||||
{
|
||||
B3_PROFILE("validate broadphases");
|
||||
|
||||
{
|
||||
B3_PROFILE("calculateOverlappingPairs m_validationBroadphase");
|
||||
//m_data->m_validationBroadphase->getAllAabbsCPU() = m_data->m_broadphaseGPU->getAllAabbsCPU();
|
||||
|
||||
m_data->m_validationBroadphase->getAllAabbsGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getAllAabbsGPU() );
|
||||
m_data->m_validationBroadphase->getSmallAabbIndicesGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getSmallAabbIndicesGPU() );
|
||||
m_data->m_validationBroadphase->getLargeAabbIndicesGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getLargeAabbIndicesGPU() );
|
||||
|
||||
m_data->m_validationBroadphase->calculateOverlappingPairs(maxOverlap);
|
||||
}
|
||||
|
||||
static b3AlignedObjectArray<b3Int4> overlappingPairs;
|
||||
static b3AlignedObjectArray<b3Int4> overlappingPairsReference;
|
||||
m_data->m_broadphaseGPU->getOverlappingPairsGPU().copyToHost(overlappingPairs);
|
||||
m_data->m_validationBroadphase->getOverlappingPairsGPU().copyToHost(overlappingPairsReference);
|
||||
|
||||
//Reorder pairs so that (pair.x < pair.y) is always true
|
||||
{
|
||||
B3_PROFILE("reorder pairs");
|
||||
|
||||
for(int i = 0; i < overlappingPairs.size(); ++i)
|
||||
{
|
||||
b3Int4 pair = overlappingPairs[i];
|
||||
if(pair.x > pair.y)
|
||||
{
|
||||
b3Swap(pair.x, pair.y);
|
||||
b3Swap(pair.z, pair.w);
|
||||
overlappingPairs[i] = pair;
|
||||
}
|
||||
}
|
||||
for(int i = 0; i < overlappingPairsReference.size(); ++i)
|
||||
{
|
||||
b3Int4 pair = overlappingPairsReference[i];
|
||||
if(pair.x > pair.y)
|
||||
{
|
||||
b3Swap(pair.x, pair.y);
|
||||
b3Swap(pair.z, pair.w);
|
||||
overlappingPairsReference[i] = pair;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
{
|
||||
B3_PROFILE("Sort overlapping pairs from most to least significant bit");
|
||||
|
||||
overlappingPairs.quickSort( OverlappingPairSortPredicate() );
|
||||
overlappingPairsReference.quickSort( OverlappingPairSortPredicate() );
|
||||
}
|
||||
|
||||
//Compare
|
||||
{
|
||||
B3_PROFILE("compare pairs");
|
||||
|
||||
int numPairs = overlappingPairs.size();
|
||||
int numPairsReference = overlappingPairsReference.size();
|
||||
|
||||
bool success = true;
|
||||
|
||||
if(numPairs == numPairsReference)
|
||||
{
|
||||
for(int i = 0; i < numPairsReference; ++i)
|
||||
{
|
||||
const b3Int4& pairA = overlappingPairs[i];
|
||||
const b3Int4& pairB = overlappingPairsReference[i];
|
||||
if( pairA.x != pairB.x
|
||||
|| pairA.y != pairB.y
|
||||
|| pairA.z != pairB.z
|
||||
|| pairA.w != pairB.w )
|
||||
{
|
||||
b3Error("Error: one or more overlappingPairs differs from reference.\n");
|
||||
success = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
b3Error("Error: numPairs %d != numPairsReference %d \n", numPairs, numPairsReference);
|
||||
success = false;
|
||||
}
|
||||
|
||||
printf("Broadphase validation: %d \n", success);
|
||||
}
|
||||
}
|
||||
|
||||
if (m_data->m_gui)
|
||||
{
|
||||
|
||||
@@ -187,6 +187,10 @@ void GpuRaytraceScene::renderScene()
|
||||
|
||||
void GpuRaytraceScene::renderScene2()
|
||||
{
|
||||
//If using the BVH to accelerate raycasting, the AABBs need to be updated or else they will
|
||||
//not match the actual rigid body positions after integration. The result is that rigid bodies
|
||||
//are not drawn or appear clipped, especially if they are moving quickly.
|
||||
m_data->m_rigidBodyPipeline->setupGpuAabbsFull();
|
||||
|
||||
// GpuBoxPlaneScene::renderScene();
|
||||
// return;
|
||||
@@ -308,7 +312,7 @@ void GpuRaytraceScene::renderScene2()
|
||||
{
|
||||
B3_PROFILE("cast primary rays");
|
||||
//m_raycaster->castRaysHost(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(),m_data->m_np->getInternalData());
|
||||
m_raycaster->castRays(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData());
|
||||
m_raycaster->castRays(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData(), m_data->m_bp);
|
||||
}
|
||||
|
||||
|
||||
@@ -350,7 +354,7 @@ void GpuRaytraceScene::renderScene2()
|
||||
{
|
||||
B3_PROFILE("cast shadow rays");
|
||||
//m_raycaster->castRaysHost(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu());
|
||||
m_raycaster->castRays(shadowRays, shadowHits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData());
|
||||
m_raycaster->castRays(shadowRays, shadowHits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData(), m_data->m_bp);
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
@@ -13,6 +13,7 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/Broadphas
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
|
||||
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
|
||||
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -14,6 +14,7 @@ rem @echo off
|
||||
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
|
||||
|
||||
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
|
||||
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify
|
||||
|
||||
|
||||
|
||||
|
||||
BIN
docs/b3GpuParallelLinearBvh.pdf
Normal file
BIN
docs/b3GpuParallelLinearBvh.pdf
Normal file
Binary file not shown.
@@ -34,6 +34,10 @@ public:
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
|
||||
|
||||
};
|
||||
|
||||
|
||||
@@ -366,4 +366,18 @@ b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
|
||||
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
|
||||
{
|
||||
return m_allAabbsCPU1;
|
||||
}
|
||||
}
|
||||
|
||||
b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU()
|
||||
{
|
||||
return m_gpuPairs;
|
||||
}
|
||||
b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU()
|
||||
{
|
||||
return m_smallAabbsMappingGPU;
|
||||
}
|
||||
b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
|
||||
{
|
||||
return m_largeAabbsMappingGPU;
|
||||
}
|
||||
|
||||
|
||||
@@ -78,6 +78,10 @@ public:
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
|
||||
|
||||
};
|
||||
|
||||
|
||||
577
src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
Normal file
577
src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.cpp
Normal file
@@ -0,0 +1,577 @@
|
||||
/*
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Initial Author Jackson Lee, 2014
|
||||
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
|
||||
#include "b3GpuParallelLinearBvh.h"
|
||||
|
||||
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
|
||||
m_queue(queue),
|
||||
m_radixSorter(context, device, queue),
|
||||
|
||||
m_rootNodeIndex(context, queue),
|
||||
m_maxDistanceFromRoot(context, queue),
|
||||
m_temp(context, queue),
|
||||
|
||||
m_internalNodeAabbs(context, queue),
|
||||
m_internalNodeLeafIndexRanges(context, queue),
|
||||
m_internalNodeChildNodes(context, queue),
|
||||
m_internalNodeParentNodes(context, queue),
|
||||
|
||||
m_commonPrefixes(context, queue),
|
||||
m_commonPrefixLengths(context, queue),
|
||||
m_distanceFromRoot(context, queue),
|
||||
|
||||
m_leafNodeParentNodes(context, queue),
|
||||
m_mortonCodesAndAabbIndicies(context, queue),
|
||||
m_mergedAabb(context, queue),
|
||||
m_leafNodeAabbs(context, queue),
|
||||
|
||||
m_largeAabbs(context, queue)
|
||||
{
|
||||
m_rootNodeIndex.resize(1);
|
||||
m_maxDistanceFromRoot.resize(1);
|
||||
m_temp.resize(1);
|
||||
|
||||
//
|
||||
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
|
||||
|
||||
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
|
||||
cl_int error;
|
||||
char* additionalMacros = 0;
|
||||
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
|
||||
b3Assert(m_parallelLinearBvhProgram);
|
||||
|
||||
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_separateAabbsKernel);
|
||||
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_findAllNodesMergedAabbKernel);
|
||||
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
|
||||
|
||||
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
|
||||
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
|
||||
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
|
||||
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_findDistanceFromRootKernel);
|
||||
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
|
||||
|
||||
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_findLeafIndexRangesKernel);
|
||||
|
||||
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
|
||||
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_plbvhRayTraverseKernel);
|
||||
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_plbvhLargeAabbAabbTestKernel);
|
||||
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
b3Assert(m_plbvhLargeAabbRayTestKernel);
|
||||
}
|
||||
|
||||
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
|
||||
{
|
||||
clReleaseKernel(m_separateAabbsKernel);
|
||||
clReleaseKernel(m_findAllNodesMergedAabbKernel);
|
||||
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
|
||||
|
||||
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
|
||||
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
|
||||
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
|
||||
clReleaseKernel(m_findDistanceFromRootKernel);
|
||||
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
|
||||
|
||||
clReleaseKernel(m_findLeafIndexRangesKernel);
|
||||
|
||||
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
|
||||
clReleaseKernel(m_plbvhRayTraverseKernel);
|
||||
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
|
||||
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
|
||||
|
||||
clReleaseProgram(m_parallelLinearBvhProgram);
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
|
||||
const b3OpenCLArray<int>& largeAabbIndices)
|
||||
{
|
||||
B3_PROFILE("b3ParallelLinearBvh::build()");
|
||||
|
||||
int numLargeAabbs = largeAabbIndices.size();
|
||||
int numSmallAabbs = smallAabbIndices.size();
|
||||
|
||||
//Since all AABBs(both large and small) are input as a contiguous array,
|
||||
//with 2 additional arrays used to indicate the indices of large and small AABBs,
|
||||
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
|
||||
{
|
||||
B3_PROFILE("Separate large and small AABBs");
|
||||
|
||||
m_largeAabbs.resize(numLargeAabbs);
|
||||
m_leafNodeAabbs.resize(numSmallAabbs);
|
||||
|
||||
//Write large AABBs into m_largeAabbs
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_largeAabbs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numLargeAabbs);
|
||||
|
||||
launcher.launch1D(numLargeAabbs);
|
||||
}
|
||||
|
||||
//Write small AABBs into m_leafNodeAabbs
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numSmallAabbs);
|
||||
|
||||
launcher.launch1D(numSmallAabbs);
|
||||
}
|
||||
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//
|
||||
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
|
||||
int numInternalNodes = numLeaves - 1;
|
||||
|
||||
if(numLeaves < 2)
|
||||
{
|
||||
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
|
||||
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
|
||||
int rootNodeIndex = numLeaves - 1;
|
||||
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
|
||||
|
||||
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
|
||||
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
|
||||
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
|
||||
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
|
||||
if(numLeaves == 1)
|
||||
{
|
||||
b3SortData leaf;
|
||||
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
|
||||
|
||||
m_mortonCodesAndAabbIndicies.resize(1);
|
||||
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
//
|
||||
{
|
||||
m_internalNodeAabbs.resize(numInternalNodes);
|
||||
m_internalNodeLeafIndexRanges.resize(numInternalNodes);
|
||||
m_internalNodeChildNodes.resize(numInternalNodes);
|
||||
m_internalNodeParentNodes.resize(numInternalNodes);
|
||||
|
||||
m_commonPrefixes.resize(numInternalNodes);
|
||||
m_commonPrefixLengths.resize(numInternalNodes);
|
||||
m_distanceFromRoot.resize(numInternalNodes);
|
||||
|
||||
m_leafNodeParentNodes.resize(numLeaves);
|
||||
m_mortonCodesAndAabbIndicies.resize(numLeaves);
|
||||
m_mergedAabb.resize(numLeaves);
|
||||
}
|
||||
|
||||
//Find the merged AABB of all small AABBs; this is used to define the size of
|
||||
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
|
||||
{
|
||||
B3_PROFILE("Find AABB of merged nodes");
|
||||
|
||||
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
|
||||
|
||||
for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
|
||||
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numAabbsNeedingMerge);
|
||||
|
||||
launcher.launch1D(numAabbsNeedingMerge);
|
||||
}
|
||||
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//Insert the center of the AABBs into a virtual grid,
|
||||
//then convert the discrete grid coordinates into a morton code
|
||||
//For each element in m_mortonCodesAndAabbIndicies, set
|
||||
// m_key == morton code (value to sort by)
|
||||
// m_value == small AABB index
|
||||
{
|
||||
B3_PROFILE("Assign morton codes");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numLeaves);
|
||||
|
||||
launcher.launch1D(numLeaves);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//
|
||||
{
|
||||
B3_PROFILE("Sort leaves by morton codes");
|
||||
|
||||
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//
|
||||
constructBinaryRadixTree();
|
||||
|
||||
|
||||
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
|
||||
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
|
||||
//The child nodes of each node split their parent's index range into 2 contiguous halves.
|
||||
//
|
||||
//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
|
||||
//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
|
||||
//
|
||||
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
|
||||
{
|
||||
B3_PROFILE("m_findLeafIndexRangesKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)
|
||||
{
|
||||
int maxPairs = out_overlappingPairs.size();
|
||||
b3OpenCLArray<int>& numPairsGpu = m_temp;
|
||||
|
||||
int reset = 0;
|
||||
numPairsGpu.copyFromHostPointer(&reset, 1);
|
||||
|
||||
//
|
||||
if( m_leafNodeAabbs.size() > 1 )
|
||||
{
|
||||
B3_PROFILE("PLBVH small-small AABB test");
|
||||
|
||||
int numQueryAabbs = m_leafNodeAabbs.size();
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
|
||||
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(maxPairs);
|
||||
launcher.setConst(numQueryAabbs);
|
||||
|
||||
launcher.launch1D(numQueryAabbs);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
int numLargeAabbRigids = m_largeAabbs.size();
|
||||
if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
|
||||
{
|
||||
B3_PROFILE("PLBVH large-small AABB test");
|
||||
|
||||
int numQueryAabbs = m_leafNodeAabbs.size();
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
|
||||
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(maxPairs);
|
||||
launcher.setConst(numLargeAabbRigids);
|
||||
launcher.setConst(numQueryAabbs);
|
||||
|
||||
launcher.launch1D(numQueryAabbs);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
int numPairs = -1;
|
||||
numPairsGpu.copyToHostPointer(&numPairs, 1);
|
||||
if(numPairs > maxPairs)
|
||||
{
|
||||
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
|
||||
numPairs = maxPairs;
|
||||
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
|
||||
}
|
||||
|
||||
out_overlappingPairs.resize(numPairs);
|
||||
}
|
||||
|
||||
|
||||
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
|
||||
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
|
||||
{
|
||||
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
|
||||
|
||||
int numRays = rays.size();
|
||||
int maxRayRigidPairs = out_rayRigidPairs.size();
|
||||
|
||||
int reset = 0;
|
||||
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
|
||||
|
||||
//
|
||||
if( m_leafNodeAabbs.size() > 0 )
|
||||
{
|
||||
B3_PROFILE("PLBVH ray test small AABB");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( rays.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
|
||||
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(maxRayRigidPairs);
|
||||
launcher.setConst(numRays);
|
||||
|
||||
launcher.launch1D(numRays);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
int numLargeAabbRigids = m_largeAabbs.size();
|
||||
if(numLargeAabbRigids > 0)
|
||||
{
|
||||
B3_PROFILE("PLBVH ray test large AABB");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( rays.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
|
||||
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numLargeAabbRigids);
|
||||
launcher.setConst(maxRayRigidPairs);
|
||||
launcher.setConst(numRays);
|
||||
|
||||
launcher.launch1D(numRays);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//
|
||||
int numRayRigidPairs = -1;
|
||||
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
|
||||
|
||||
if(numRayRigidPairs > maxRayRigidPairs)
|
||||
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
|
||||
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
|
||||
{
|
||||
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
|
||||
|
||||
int numLeaves = m_leafNodeAabbs.size();
|
||||
int numInternalNodes = numLeaves - 1;
|
||||
|
||||
//Each internal node is placed in between 2 leaf nodes.
|
||||
//By using this arrangement and computing the common prefix between
|
||||
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
|
||||
{
|
||||
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//For each leaf node, select its parent node by
|
||||
//comparing the 2 nearest internal nodes and assign child node indices
|
||||
{
|
||||
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
|
||||
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numLeaves);
|
||||
|
||||
launcher.launch1D(numLeaves);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//For each internal node, perform 2 binary searches among the other internal nodes
|
||||
//to its left and right to find its potential parent nodes and assign child node indices
|
||||
{
|
||||
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//Find the number of nodes seperating each internal node and the root node
|
||||
//so that the AABBs can be set using the next kernel.
|
||||
//Also determine the maximum number of nodes separating an internal node and the root node.
|
||||
{
|
||||
B3_PROFILE("m_findDistanceFromRootKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
|
||||
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
|
||||
//the tree towards the root to set the AABBs of each internal node; each internal node
|
||||
//checks its children and merges their AABBs
|
||||
{
|
||||
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
|
||||
|
||||
int maxDistanceFromRoot = -1;
|
||||
{
|
||||
B3_PROFILE("copy maxDistanceFromRoot to CPU");
|
||||
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(maxDistanceFromRoot);
|
||||
launcher.setConst(distanceFromRoot);
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
//It may seem inefficent to launch a thread for each internal node when a
|
||||
//much smaller number of nodes is actually processed, but this is actually
|
||||
//faster than determining the exact nodes that are ready to merge their child AABBs.
|
||||
launcher.launch1D(numInternalNodes);
|
||||
}
|
||||
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
125
src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
Normal file
125
src/Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h
Normal file
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Initial Author Jackson Lee, 2014
|
||||
|
||||
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H
|
||||
#define B3_GPU_PARALLEL_LINEAR_BVH_H
|
||||
|
||||
//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
|
||||
#include "Bullet3Common/shared/b3Int2.h"
|
||||
#include "Bullet3Common/shared/b3Int4.h"
|
||||
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
|
||||
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
|
||||
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
|
||||
|
||||
#define b3Int64 cl_long
|
||||
|
||||
///@brief GPU Parallel Linearized Bounding Volume Heirarchy(LBVH) that is reconstructed every frame
|
||||
///@remarks
|
||||
///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details.
|
||||
///@par
|
||||
///Related papers: \n
|
||||
///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
|
||||
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
|
||||
///@par
|
||||
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
|
||||
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
|
||||
/// - [fully parallel] Sort morton codes
|
||||
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
|
||||
/// - [somewhat parallel] Set internal node AABBs
|
||||
///@par
|
||||
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
|
||||
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
|
||||
///Instead of searching for the child nodes of each internal node, we search for the parent node of each node.
|
||||
///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs.
|
||||
class b3GpuParallelLinearBvh
|
||||
{
|
||||
cl_command_queue m_queue;
|
||||
|
||||
cl_program m_parallelLinearBvhProgram;
|
||||
|
||||
cl_kernel m_separateAabbsKernel;
|
||||
cl_kernel m_findAllNodesMergedAabbKernel;
|
||||
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
|
||||
|
||||
//Binary radix tree construction kernels
|
||||
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
|
||||
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
|
||||
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
|
||||
cl_kernel m_findDistanceFromRootKernel;
|
||||
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
|
||||
|
||||
cl_kernel m_findLeafIndexRangesKernel;
|
||||
|
||||
//Traversal kernels
|
||||
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
|
||||
cl_kernel m_plbvhRayTraverseKernel;
|
||||
cl_kernel m_plbvhLargeAabbAabbTestKernel;
|
||||
cl_kernel m_plbvhLargeAabbRayTestKernel;
|
||||
|
||||
b3RadixSort32CL m_radixSorter;
|
||||
|
||||
//1 element
|
||||
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
|
||||
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
|
||||
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
|
||||
|
||||
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
|
||||
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
|
||||
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
|
||||
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
|
||||
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
|
||||
|
||||
//1 element per internal node; for binary radix tree construction
|
||||
b3OpenCLArray<b3Int64> m_commonPrefixes;
|
||||
b3OpenCLArray<int> m_commonPrefixLengths;
|
||||
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
|
||||
|
||||
//1 element per leaf node (leaf nodes only include small AABBs)
|
||||
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
|
||||
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
|
||||
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
|
||||
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
|
||||
|
||||
//1 element per large AABB, which is not stored in the BVH
|
||||
b3OpenCLArray<b3SapAabb> m_largeAabbs;
|
||||
|
||||
public:
|
||||
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
|
||||
virtual ~b3GpuParallelLinearBvh();
|
||||
|
||||
///Must be called before any other function
|
||||
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
|
||||
const b3OpenCLArray<int>& largeAabbIndices);
|
||||
|
||||
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
|
||||
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
|
||||
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
|
||||
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
|
||||
|
||||
///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
|
||||
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
|
||||
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
|
||||
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
|
||||
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
|
||||
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
|
||||
|
||||
private:
|
||||
void constructBinaryRadixTree();
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,80 @@
|
||||
/*
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Initial Author Jackson Lee, 2014
|
||||
|
||||
#include "b3GpuParallelLinearBvhBroadphase.h"
|
||||
|
||||
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
|
||||
m_plbvh(context, device, queue),
|
||||
|
||||
m_overlappingPairsGpu(context, queue),
|
||||
|
||||
m_aabbsGpu(context, queue),
|
||||
m_smallAabbsMappingGpu(context, queue),
|
||||
m_largeAabbsMappingGpu(context, queue)
|
||||
{
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
|
||||
{
|
||||
int newAabbIndex = m_aabbsCpu.size();
|
||||
|
||||
b3SapAabb aabb;
|
||||
aabb.m_minVec = aabbMin;
|
||||
aabb.m_maxVec = aabbMax;
|
||||
|
||||
aabb.m_minIndices[3] = userPtr;
|
||||
aabb.m_signedMaxIndices[3] = newAabbIndex;
|
||||
|
||||
m_smallAabbsMappingCpu.push_back(newAabbIndex);
|
||||
|
||||
m_aabbsCpu.push_back(aabb);
|
||||
}
|
||||
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
|
||||
{
|
||||
int newAabbIndex = m_aabbsCpu.size();
|
||||
|
||||
b3SapAabb aabb;
|
||||
aabb.m_minVec = aabbMin;
|
||||
aabb.m_maxVec = aabbMax;
|
||||
|
||||
aabb.m_minIndices[3] = userPtr;
|
||||
aabb.m_signedMaxIndices[3] = newAabbIndex;
|
||||
|
||||
m_largeAabbsMappingCpu.push_back(newAabbIndex);
|
||||
|
||||
m_aabbsCpu.push_back(aabb);
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
|
||||
{
|
||||
//Reconstruct BVH
|
||||
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
|
||||
|
||||
//
|
||||
m_overlappingPairsGpu.resize(maxPairs);
|
||||
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
|
||||
}
|
||||
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
|
||||
{
|
||||
b3Assert(0); //CPU version not implemented
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
|
||||
{
|
||||
m_aabbsGpu.copyFromHost(m_aabbsCpu);
|
||||
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
|
||||
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,66 @@
|
||||
/*
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Initial Author Jackson Lee, 2014
|
||||
|
||||
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
|
||||
#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
|
||||
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
|
||||
|
||||
#include "b3GpuParallelLinearBvh.h"
|
||||
|
||||
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
|
||||
{
|
||||
b3GpuParallelLinearBvh m_plbvh;
|
||||
|
||||
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
|
||||
b3OpenCLArray<int> m_smallAabbsMappingGpu;
|
||||
b3OpenCLArray<int> m_largeAabbsMappingGpu;
|
||||
|
||||
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
|
||||
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
|
||||
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
|
||||
|
||||
public:
|
||||
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
|
||||
virtual ~b3GpuParallelLinearBvhBroadphase() {}
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask);
|
||||
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
virtual void writeAabbsToGpu();
|
||||
|
||||
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
|
||||
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
|
||||
|
||||
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
|
||||
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
|
||||
{
|
||||
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
|
||||
}
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -1307,3 +1307,16 @@ cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
|
||||
{
|
||||
return m_overlappingPairs.getBufferCL();
|
||||
}
|
||||
|
||||
b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU()
|
||||
{
|
||||
return m_overlappingPairs;
|
||||
}
|
||||
b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU()
|
||||
{
|
||||
return m_smallAabbsMappingGPU;
|
||||
}
|
||||
b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU()
|
||||
{
|
||||
return m_largeAabbsMappingGPU;
|
||||
}
|
||||
|
||||
@@ -143,6 +143,9 @@ public:
|
||||
virtual int getNumOverlap();
|
||||
virtual cl_mem getOverlappingPairBuffer();
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
|
||||
};
|
||||
|
||||
#endif //B3_GPU_SAP_BROADPHASE_H
|
||||
@@ -0,0 +1,767 @@
|
||||
/*
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Initial Author Jackson Lee, 2014
|
||||
|
||||
typedef float b3Scalar;
|
||||
typedef float4 b3Vector3;
|
||||
#define b3Max max
|
||||
#define b3Min min
|
||||
#define b3Sqrt sqrt
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned int m_key;
|
||||
unsigned int m_value;
|
||||
} SortDataCL;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
union
|
||||
{
|
||||
float4 m_min;
|
||||
float m_minElems[4];
|
||||
int m_minIndices[4];
|
||||
};
|
||||
union
|
||||
{
|
||||
float4 m_max;
|
||||
float m_maxElems[4];
|
||||
int m_maxIndices[4];
|
||||
};
|
||||
} b3AabbCL;
|
||||
|
||||
|
||||
unsigned int interleaveBits(unsigned int x)
|
||||
{
|
||||
//........ ........ ......12 3456789A //x
|
||||
//....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits
|
||||
|
||||
//......12 3456789A ......12 3456789A //x ^ (x << 16)
|
||||
//11111111 ........ ........ 11111111 //0x FF 00 00 FF
|
||||
//......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;
|
||||
|
||||
//......12 ........ 3456789A 3456789A //x ^ (x << 8)
|
||||
//......11 ........ 1111.... ....1111 //0x 03 00 F0 0F
|
||||
//......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;
|
||||
|
||||
//..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)
|
||||
//......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3
|
||||
//......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;
|
||||
|
||||
//....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)
|
||||
//....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49
|
||||
//....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;
|
||||
|
||||
//........ ........ ......11 11111111 //0x000003FF
|
||||
x &= 0x000003FF; //Clear all bits above bit 10
|
||||
|
||||
x = (x ^ (x << 16)) & 0xFF0000FF;
|
||||
x = (x ^ (x << 8)) & 0x0300F00F;
|
||||
x = (x ^ (x << 4)) & 0x030C30C3;
|
||||
x = (x ^ (x << 2)) & 0x09249249;
|
||||
|
||||
return x;
|
||||
}
|
||||
unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
|
||||
{
|
||||
return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
|
||||
}
|
||||
|
||||
__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)
|
||||
{
|
||||
int separatedAabbIndex = get_global_id(0);
|
||||
if(separatedAabbIndex >= numAabbsToSeparate) return;
|
||||
|
||||
int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];
|
||||
out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];
|
||||
}
|
||||
|
||||
//Should replace with an optimized parallel reduction
|
||||
__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
|
||||
{
|
||||
//Each time this kernel is added to the command queue,
|
||||
//the number of AABBs needing to be merged is halved
|
||||
//
|
||||
//Example with 159 AABBs:
|
||||
// numRemainingAabbs == 159 / 2 + 159 % 2 == 80
|
||||
// numMergedAabbs == 159 - 80 == 79
|
||||
//So, indices [0, 78] are merged with [0 + 80, 78 + 80]
|
||||
|
||||
int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
|
||||
int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;
|
||||
|
||||
int aabbIndex = get_global_id(0);
|
||||
if(aabbIndex >= numMergedAabbs) return;
|
||||
|
||||
int otherAabbIndex = aabbIndex + numRemainingAabbs;
|
||||
|
||||
b3AabbCL aabb = out_mergedAabb[aabbIndex];
|
||||
b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
|
||||
|
||||
b3AabbCL mergedAabb;
|
||||
mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
|
||||
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
|
||||
out_mergedAabb[aabbIndex] = mergedAabb;
|
||||
}
|
||||
|
||||
__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes,
|
||||
__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)
|
||||
{
|
||||
int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index
|
||||
if(leafNodeIndex >= numAabbs) return;
|
||||
|
||||
b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];
|
||||
b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;
|
||||
b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;
|
||||
|
||||
b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
|
||||
b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;
|
||||
b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;
|
||||
|
||||
//Quantize into integer coordinates
|
||||
//floor() is needed to prevent the center cell, at (0,0,0) from being twice the size
|
||||
b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;
|
||||
|
||||
int4 discretePosition;
|
||||
discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
|
||||
discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
|
||||
discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
|
||||
|
||||
//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
|
||||
discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
|
||||
discretePosition += 512;
|
||||
|
||||
//Interleave bits(assign a morton code, also known as a z-curve)
|
||||
unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
|
||||
|
||||
//
|
||||
SortDataCL mortonCodeIndexPair;
|
||||
mortonCodeIndexPair.m_key = mortonCode;
|
||||
mortonCodeIndexPair.m_value = leafNodeIndex;
|
||||
|
||||
out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
|
||||
}
|
||||
|
||||
#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128
|
||||
|
||||
//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.
|
||||
//If it is set, then the index is for an internal node; otherwise, it is a leaf node.
|
||||
//In both cases, the bit should be cleared to access the actual node index.
|
||||
int isLeafNode(int index) { return (index >> 31 == 0); }
|
||||
int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }
|
||||
int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }
|
||||
|
||||
//From sap.cl
|
||||
#define NEW_PAIR_MARKER -1
|
||||
|
||||
bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
|
||||
{
|
||||
bool overlap = true;
|
||||
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
|
||||
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
|
||||
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
//From sap.cl
|
||||
|
||||
__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
|
||||
|
||||
__global int* rootNodeIndex,
|
||||
__global int2* internalNodeChildIndices,
|
||||
__global b3AabbCL* internalNodeAabbs,
|
||||
__global int2* internalNodeLeafIndexRanges,
|
||||
|
||||
__global SortDataCL* mortonCodesAndAabbIndices,
|
||||
__global int* out_numPairs, __global int4* out_overlappingPairs,
|
||||
int maxPairs, int numQueryAabbs)
|
||||
{
|
||||
//Using get_group_id()/get_local_id() is Faster than get_global_id(0) since
|
||||
//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)
|
||||
int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
|
||||
if(queryBvhNodeIndex >= numQueryAabbs) return;
|
||||
|
||||
int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;
|
||||
b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
|
||||
|
||||
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
|
||||
|
||||
int stackSize = 1;
|
||||
stack[0] = *rootNodeIndex;
|
||||
|
||||
while(stackSize)
|
||||
{
|
||||
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
|
||||
--stackSize;
|
||||
|
||||
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
|
||||
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
|
||||
|
||||
//Optimization - if the BVH is structured as a binary radix tree, then
|
||||
//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).
|
||||
//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.
|
||||
{
|
||||
int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;
|
||||
if(highestLeafIndex <= queryBvhNodeIndex) continue;
|
||||
}
|
||||
|
||||
//bvhRigidIndex is not used if internal node
|
||||
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
|
||||
|
||||
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
|
||||
if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
|
||||
{
|
||||
if(isLeaf)
|
||||
{
|
||||
int4 pair;
|
||||
pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
|
||||
pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
|
||||
pair.z = NEW_PAIR_MARKER;
|
||||
pair.w = NEW_PAIR_MARKER;
|
||||
|
||||
int pairIndex = atomic_inc(out_numPairs);
|
||||
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
|
||||
}
|
||||
|
||||
if(!isLeaf) //Internal node
|
||||
{
|
||||
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
|
||||
{
|
||||
//Error
|
||||
}
|
||||
else
|
||||
{
|
||||
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
|
||||
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//From rayCastKernels.cl
|
||||
typedef struct
|
||||
{
|
||||
float4 m_from;
|
||||
float4 m_to;
|
||||
} b3RayInfo;
|
||||
//From rayCastKernels.cl
|
||||
|
||||
b3Vector3 b3Vector3_normalize(b3Vector3 v)
|
||||
{
|
||||
b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};
|
||||
return normalize(normal); //OpenCL normalize == vector4 normalize
|
||||
}
|
||||
b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
|
||||
b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
|
||||
|
||||
int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)
|
||||
{
|
||||
//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).
|
||||
//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.
|
||||
//
|
||||
//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane
|
||||
//and min.x will be the far plane; otherwise, it is reversed.
|
||||
//
|
||||
//In order for there to be a collision, the t_min and t_max of each pair must overlap.
|
||||
//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.
|
||||
|
||||
int4 isNegative = isless( rayNormalizedDirection, (b3Vector3){0.0f, 0.0f, 0.0f, 0.0f} ); //isless(x,y) returns (x < y)
|
||||
|
||||
//When using vector types, the select() function checks the most signficant bit,
|
||||
//but isless() sets the least significant bit.
|
||||
isNegative <<= 31;
|
||||
|
||||
//select(b, a, condition) == condition ? a : b
|
||||
//When using select() with vector types, (condition[i]) is true if its most significant bit is 1
|
||||
b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;
|
||||
b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;
|
||||
|
||||
b3Scalar t_min_final = 0.0f;
|
||||
b3Scalar t_max_final = rayLength;
|
||||
|
||||
//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned.
|
||||
//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])
|
||||
//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.
|
||||
t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );
|
||||
t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );
|
||||
|
||||
return (t_min_final <= t_max_final);
|
||||
}
|
||||
|
||||
__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,
|
||||
|
||||
__global int* rootNodeIndex,
|
||||
__global int2* internalNodeChildIndices,
|
||||
__global b3AabbCL* internalNodeAabbs,
|
||||
__global int2* internalNodeLeafIndexRanges,
|
||||
__global SortDataCL* mortonCodesAndAabbIndices,
|
||||
|
||||
__global b3RayInfo* rays,
|
||||
|
||||
__global int* out_numRayRigidPairs,
|
||||
__global int2* out_rayRigidPairs,
|
||||
int maxRayRigidPairs, int numRays)
|
||||
{
|
||||
int rayIndex = get_global_id(0);
|
||||
if(rayIndex >= numRays) return;
|
||||
|
||||
//
|
||||
b3Vector3 rayFrom = rays[rayIndex].m_from;
|
||||
b3Vector3 rayTo = rays[rayIndex].m_to;
|
||||
b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
|
||||
b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
|
||||
|
||||
//
|
||||
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
|
||||
|
||||
int stackSize = 1;
|
||||
stack[0] = *rootNodeIndex;
|
||||
|
||||
while(stackSize)
|
||||
{
|
||||
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
|
||||
--stackSize;
|
||||
|
||||
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
|
||||
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
|
||||
|
||||
//bvhRigidIndex is not used if internal node
|
||||
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
|
||||
|
||||
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
|
||||
if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )
|
||||
{
|
||||
if(isLeaf)
|
||||
{
|
||||
int2 rayRigidPair;
|
||||
rayRigidPair.x = rayIndex;
|
||||
rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
|
||||
|
||||
int pairIndex = atomic_inc(out_numRayRigidPairs);
|
||||
if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
|
||||
}
|
||||
|
||||
if(!isLeaf) //Internal node
|
||||
{
|
||||
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
|
||||
{
|
||||
//Error
|
||||
}
|
||||
else
|
||||
{
|
||||
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
|
||||
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs,
|
||||
__global int* out_numPairs, __global int4* out_overlappingPairs,
|
||||
int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)
|
||||
{
|
||||
int smallAabbIndex = get_global_id(0);
|
||||
if(smallAabbIndex >= numSmallAabbRigids) return;
|
||||
|
||||
b3AabbCL smallAabb = smallAabbs[smallAabbIndex];
|
||||
for(int i = 0; i < numLargeAabbRigids; ++i)
|
||||
{
|
||||
b3AabbCL largeAabb = largeAabbs[i];
|
||||
if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )
|
||||
{
|
||||
int4 pair;
|
||||
pair.x = largeAabb.m_minIndices[3];
|
||||
pair.y = smallAabb.m_minIndices[3];
|
||||
pair.z = NEW_PAIR_MARKER;
|
||||
pair.w = NEW_PAIR_MARKER;
|
||||
|
||||
int pairIndex = atomic_inc(out_numPairs);
|
||||
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
|
||||
}
|
||||
}
|
||||
}
|
||||
__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,
|
||||
__global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,
|
||||
int numLargeAabbRigids, int maxRayRigidPairs, int numRays)
|
||||
{
|
||||
int rayIndex = get_global_id(0);
|
||||
if(rayIndex >= numRays) return;
|
||||
|
||||
b3Vector3 rayFrom = rays[rayIndex].m_from;
|
||||
b3Vector3 rayTo = rays[rayIndex].m_to;
|
||||
b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
|
||||
b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
|
||||
|
||||
for(int i = 0; i < numLargeAabbRigids; ++i)
|
||||
{
|
||||
b3AabbCL rigidAabb = largeRigidAabbs[i];
|
||||
if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )
|
||||
{
|
||||
int2 rayRigidPair;
|
||||
rayRigidPair.x = rayIndex;
|
||||
rayRigidPair.y = rigidAabb.m_minIndices[3];
|
||||
|
||||
int pairIndex = atomic_inc(out_numRayRigidPairs);
|
||||
if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.
|
||||
//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.
|
||||
//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.
|
||||
//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).
|
||||
#define B3_PLBVH_INVALID_COMMON_PREFIX 128
|
||||
|
||||
#define B3_PLBVH_ROOT_NODE_MARKER -1
|
||||
|
||||
#define b3Int64 long
|
||||
|
||||
int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }
|
||||
b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j)
|
||||
{
|
||||
//This function only needs to return (i & j) in order for the algorithm to work,
|
||||
//but it may help with debugging to mask out the lower bits.
|
||||
|
||||
b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);
|
||||
|
||||
b3Int64 sharedBits = i & j;
|
||||
b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0
|
||||
|
||||
return sharedBits & bitmask;
|
||||
}
|
||||
|
||||
//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths
|
||||
int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)
|
||||
{
|
||||
return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );
|
||||
}
|
||||
|
||||
__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,
|
||||
__global b3Int64* out_commonPrefixes,
|
||||
__global int* out_commonPrefixLengths,
|
||||
int numInternalNodes)
|
||||
{
|
||||
int internalNodeIndex = get_global_id(0);
|
||||
if (internalNodeIndex >= numInternalNodes) return;
|
||||
|
||||
//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,
|
||||
//and the number of internal nodes is always numLeafNodes - 1
|
||||
int leftLeafIndex = internalNodeIndex;
|
||||
int rightLeafIndex = internalNodeIndex + 1;
|
||||
|
||||
int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;
|
||||
int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;
|
||||
|
||||
//Binary radix tree construction algorithm does not work if there are duplicate morton codes.
|
||||
//Append the index of each leaf node to each morton code so that there are no duplicates.
|
||||
//The algorithm also requires that the morton codes are sorted in ascending order; this requirement
|
||||
//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.
|
||||
//
|
||||
//upsample(a, b) == ( ((b3Int64)a) << 32) | b
|
||||
b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);
|
||||
b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);
|
||||
|
||||
out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);
|
||||
out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);
|
||||
}
|
||||
|
||||
|
||||
__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,
|
||||
__global int2* out_childNodes, int numLeafNodes)
|
||||
{
|
||||
int leafNodeIndex = get_global_id(0);
|
||||
if (leafNodeIndex >= numLeafNodes) return;
|
||||
|
||||
int numInternalNodes = numLeafNodes - 1;
|
||||
|
||||
int leftSplitIndex = leafNodeIndex - 1;
|
||||
int rightSplitIndex = leafNodeIndex;
|
||||
|
||||
int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
|
||||
int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
|
||||
|
||||
//Parent node is the highest adjacent common prefix that is lower than the node's common prefix
|
||||
//Leaf nodes are considered as having the highest common prefix
|
||||
int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);
|
||||
|
||||
//Handle cases for the edge nodes; the first and last node
|
||||
//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX
|
||||
if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;
|
||||
if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;
|
||||
|
||||
int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;
|
||||
out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;
|
||||
|
||||
int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa
|
||||
|
||||
//out_childNodesAsInt[0] == int2.x == left child
|
||||
//out_childNodesAsInt[1] == int2.y == right child
|
||||
int isLeaf = 1;
|
||||
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
|
||||
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);
|
||||
}
|
||||
|
||||
__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,
|
||||
__global int2* out_childNodes,
|
||||
__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,
|
||||
int numInternalNodes)
|
||||
{
|
||||
int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
|
||||
if(internalNodeIndex >= numInternalNodes) return;
|
||||
|
||||
b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];
|
||||
int nodePrefixLength = commonPrefixLengths[internalNodeIndex];
|
||||
|
||||
//#define USE_LINEAR_SEARCH
|
||||
#ifdef USE_LINEAR_SEARCH
|
||||
int leftIndex = -1;
|
||||
int rightIndex = -1;
|
||||
|
||||
//Find nearest element to left with a lower common prefix
|
||||
for(int i = internalNodeIndex - 1; i >= 0; --i)
|
||||
{
|
||||
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
|
||||
if(nodeLeftSharedPrefixLength < nodePrefixLength)
|
||||
{
|
||||
leftIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
//Find nearest element to right with a lower common prefix
|
||||
for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)
|
||||
{
|
||||
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
|
||||
if(nodeRightSharedPrefixLength < nodePrefixLength)
|
||||
{
|
||||
rightIndex = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#else //Use binary search
|
||||
|
||||
//Find nearest element to left with a lower common prefix
|
||||
int leftIndex = -1;
|
||||
{
|
||||
int lower = 0;
|
||||
int upper = internalNodeIndex - 1;
|
||||
|
||||
while(lower <= upper)
|
||||
{
|
||||
int mid = (lower + upper) / 2;
|
||||
b3Int64 midPrefix = commonPrefixes[mid];
|
||||
int midPrefixLength = commonPrefixLengths[mid];
|
||||
|
||||
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
|
||||
if(nodeMidSharedPrefixLength < nodePrefixLength)
|
||||
{
|
||||
int right = mid + 1;
|
||||
if(right < internalNodeIndex)
|
||||
{
|
||||
b3Int64 rightPrefix = commonPrefixes[right];
|
||||
int rightPrefixLength = commonPrefixLengths[right];
|
||||
|
||||
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);
|
||||
if(nodeRightSharedPrefixLength < nodePrefixLength)
|
||||
{
|
||||
lower = right;
|
||||
leftIndex = right;
|
||||
}
|
||||
else
|
||||
{
|
||||
leftIndex = mid;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
leftIndex = mid;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else upper = mid - 1;
|
||||
}
|
||||
}
|
||||
|
||||
//Find nearest element to right with a lower common prefix
|
||||
int rightIndex = -1;
|
||||
{
|
||||
int lower = internalNodeIndex + 1;
|
||||
int upper = numInternalNodes - 1;
|
||||
|
||||
while(lower <= upper)
|
||||
{
|
||||
int mid = (lower + upper) / 2;
|
||||
b3Int64 midPrefix = commonPrefixes[mid];
|
||||
int midPrefixLength = commonPrefixLengths[mid];
|
||||
|
||||
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
|
||||
if(nodeMidSharedPrefixLength < nodePrefixLength)
|
||||
{
|
||||
int left = mid - 1;
|
||||
if(left > internalNodeIndex)
|
||||
{
|
||||
b3Int64 leftPrefix = commonPrefixes[left];
|
||||
int leftPrefixLength = commonPrefixLengths[left];
|
||||
|
||||
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);
|
||||
if(nodeLeftSharedPrefixLength < nodePrefixLength)
|
||||
{
|
||||
upper = left;
|
||||
rightIndex = left;
|
||||
}
|
||||
else
|
||||
{
|
||||
rightIndex = mid;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
rightIndex = mid;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else lower = mid + 1;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//Select parent
|
||||
{
|
||||
int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
|
||||
int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
|
||||
|
||||
int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);
|
||||
|
||||
if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;
|
||||
else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;
|
||||
|
||||
int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;
|
||||
|
||||
int isRootNode = (leftIndex == -1 && rightIndex == -1);
|
||||
out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;
|
||||
|
||||
int isLeaf = 0;
|
||||
if(!isRootNode)
|
||||
{
|
||||
int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa
|
||||
|
||||
//out_childNodesAsInt[0] == int2.x == left child
|
||||
//out_childNodesAsInt[1] == int2.y == right child
|
||||
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
|
||||
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
|
||||
}
|
||||
else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,
|
||||
__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)
|
||||
{
|
||||
if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);
|
||||
|
||||
int internalNodeIndex = get_global_id(0);
|
||||
if(internalNodeIndex >= numInternalNodes) return;
|
||||
|
||||
//
|
||||
int distanceFromRoot = 0;
|
||||
{
|
||||
int parentIndex = internalNodeParentNodes[internalNodeIndex];
|
||||
while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)
|
||||
{
|
||||
parentIndex = internalNodeParentNodes[parentIndex];
|
||||
++distanceFromRoot;
|
||||
}
|
||||
}
|
||||
out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;
|
||||
|
||||
//
|
||||
__local int localMaxDistanceFromRoot;
|
||||
if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);
|
||||
}
|
||||
|
||||
__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,
|
||||
__global int2* childNodes,
|
||||
__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,
|
||||
int maxDistanceFromRoot, int processedDistance, int numInternalNodes)
|
||||
{
|
||||
int internalNodeIndex = get_global_id(0);
|
||||
if(internalNodeIndex >= numInternalNodes) return;
|
||||
|
||||
int distance = distanceFromRoot[internalNodeIndex];
|
||||
|
||||
if(distance == processedDistance)
|
||||
{
|
||||
int leftChildIndex = childNodes[internalNodeIndex].x;
|
||||
int rightChildIndex = childNodes[internalNodeIndex].y;
|
||||
|
||||
int isLeftChildLeaf = isLeafNode(leftChildIndex);
|
||||
int isRightChildLeaf = isLeafNode(rightChildIndex);
|
||||
|
||||
leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);
|
||||
rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);
|
||||
|
||||
//leftRigidIndex/rightRigidIndex is not used if internal node
|
||||
int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
|
||||
int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
|
||||
|
||||
b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];
|
||||
b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];
|
||||
|
||||
b3AabbCL mergedAabb;
|
||||
mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
|
||||
mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
|
||||
internalNodeAabbs[internalNodeIndex] = mergedAabb;
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)
|
||||
{
|
||||
int internalNodeIndex = get_global_id(0);
|
||||
if(internalNodeIndex >= numInternalNodes) return;
|
||||
|
||||
int numLeafNodes = numInternalNodes + 1;
|
||||
|
||||
int2 childNodes = internalNodeChildNodes[internalNodeIndex];
|
||||
|
||||
int2 leafIndexRange; //x == min leaf index, y == max leaf index
|
||||
|
||||
//Find lowest leaf index covered by this internal node
|
||||
{
|
||||
int lowestIndex = childNodes.x; //childNodes.x == Left child
|
||||
while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;
|
||||
leafIndexRange.x = lowestIndex;
|
||||
}
|
||||
|
||||
//Find highest leaf index covered by this internal node
|
||||
{
|
||||
int highestIndex = childNodes.y; //childNodes.y == Right child
|
||||
while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;
|
||||
leafIndexRange.y = highestIndex;
|
||||
}
|
||||
|
||||
//
|
||||
out_leafIndexRanges[internalNodeIndex] = leafIndexRange;
|
||||
}
|
||||
@@ -0,0 +1,729 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* parallelLinearBvhCL= \
|
||||
"/*\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose,\n"
|
||||
"including commercial applications, and to alter it and redistribute it freely,\n"
|
||||
"subject to the following restrictions:\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Initial Author Jackson Lee, 2014\n"
|
||||
"typedef float b3Scalar;\n"
|
||||
"typedef float4 b3Vector3;\n"
|
||||
"#define b3Max max\n"
|
||||
"#define b3Min min\n"
|
||||
"#define b3Sqrt sqrt\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" unsigned int m_key;\n"
|
||||
" unsigned int m_value;\n"
|
||||
"} SortDataCL;\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"unsigned int interleaveBits(unsigned int x)\n"
|
||||
"{\n"
|
||||
" //........ ........ ......12 3456789A //x\n"
|
||||
" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n"
|
||||
" \n"
|
||||
" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n"
|
||||
" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n"
|
||||
" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n"
|
||||
" \n"
|
||||
" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n"
|
||||
" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n"
|
||||
" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n"
|
||||
" \n"
|
||||
" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n"
|
||||
" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n"
|
||||
" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n"
|
||||
" \n"
|
||||
" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n"
|
||||
" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n"
|
||||
" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n"
|
||||
" \n"
|
||||
" //........ ........ ......11 11111111 //0x000003FF\n"
|
||||
" x &= 0x000003FF; //Clear all bits above bit 10\n"
|
||||
" \n"
|
||||
" x = (x ^ (x << 16)) & 0xFF0000FF;\n"
|
||||
" x = (x ^ (x << 8)) & 0x0300F00F;\n"
|
||||
" x = (x ^ (x << 4)) & 0x030C30C3;\n"
|
||||
" x = (x ^ (x << 2)) & 0x09249249;\n"
|
||||
" \n"
|
||||
" return x;\n"
|
||||
"}\n"
|
||||
"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
|
||||
"{\n"
|
||||
" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
|
||||
"}\n"
|
||||
"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
|
||||
"{\n"
|
||||
" int separatedAabbIndex = get_global_id(0);\n"
|
||||
" if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
|
||||
" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
|
||||
" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
|
||||
"}\n"
|
||||
"//Should replace with an optimized parallel reduction\n"
|
||||
"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
|
||||
"{\n"
|
||||
" //Each time this kernel is added to the command queue, \n"
|
||||
" //the number of AABBs needing to be merged is halved\n"
|
||||
" //\n"
|
||||
" //Example with 159 AABBs:\n"
|
||||
" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
|
||||
" // numMergedAabbs == 159 - 80 == 79\n"
|
||||
" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
|
||||
" \n"
|
||||
" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
|
||||
" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
|
||||
" \n"
|
||||
" int aabbIndex = get_global_id(0);\n"
|
||||
" if(aabbIndex >= numMergedAabbs) return;\n"
|
||||
" \n"
|
||||
" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
|
||||
" \n"
|
||||
" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
|
||||
" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
|
||||
" \n"
|
||||
" b3AabbCL mergedAabb;\n"
|
||||
" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
|
||||
" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
|
||||
" out_mergedAabb[aabbIndex] = mergedAabb;\n"
|
||||
"}\n"
|
||||
"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
|
||||
" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
|
||||
"{\n"
|
||||
" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n"
|
||||
" if(leafNodeIndex >= numAabbs) return;\n"
|
||||
" \n"
|
||||
" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
|
||||
" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
|
||||
" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
|
||||
" \n"
|
||||
" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
|
||||
" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
|
||||
" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
|
||||
" \n"
|
||||
" //Quantize into integer coordinates\n"
|
||||
" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
|
||||
" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
|
||||
" \n"
|
||||
" int4 discretePosition;\n"
|
||||
" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
|
||||
" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
|
||||
" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
|
||||
" \n"
|
||||
" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
|
||||
" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
|
||||
" discretePosition += 512;\n"
|
||||
" \n"
|
||||
" //Interleave bits(assign a morton code, also known as a z-curve)\n"
|
||||
" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" SortDataCL mortonCodeIndexPair;\n"
|
||||
" mortonCodeIndexPair.m_key = mortonCode;\n"
|
||||
" mortonCodeIndexPair.m_value = leafNodeIndex;\n"
|
||||
" \n"
|
||||
" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
|
||||
"}\n"
|
||||
"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
|
||||
"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
|
||||
"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
|
||||
"//In both cases, the bit should be cleared to access the actual node index.\n"
|
||||
"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
|
||||
"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
|
||||
"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
|
||||
"//From sap.cl\n"
|
||||
"#define NEW_PAIR_MARKER -1\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"//From sap.cl\n"
|
||||
"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
|
||||
" __global int* rootNodeIndex, \n"
|
||||
" __global int2* internalNodeChildIndices, \n"
|
||||
" __global b3AabbCL* internalNodeAabbs,\n"
|
||||
" __global int2* internalNodeLeafIndexRanges,\n"
|
||||
" \n"
|
||||
" __global SortDataCL* mortonCodesAndAabbIndices,\n"
|
||||
" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
|
||||
" int maxPairs, int numQueryAabbs)\n"
|
||||
"{\n"
|
||||
" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
|
||||
" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
|
||||
" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
|
||||
" if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
|
||||
" \n"
|
||||
" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
|
||||
" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
|
||||
" \n"
|
||||
" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
|
||||
" \n"
|
||||
" int stackSize = 1;\n"
|
||||
" stack[0] = *rootNodeIndex;\n"
|
||||
" \n"
|
||||
" while(stackSize)\n"
|
||||
" {\n"
|
||||
" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
|
||||
" --stackSize;\n"
|
||||
" \n"
|
||||
" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
|
||||
" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
|
||||
" \n"
|
||||
" //Optimization - if the BVH is structured as a binary radix tree, then\n"
|
||||
" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
|
||||
" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
|
||||
" {\n"
|
||||
" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
|
||||
" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" //bvhRigidIndex is not used if internal node\n"
|
||||
" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
|
||||
" \n"
|
||||
" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
|
||||
" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
|
||||
" {\n"
|
||||
" if(isLeaf)\n"
|
||||
" {\n"
|
||||
" int4 pair;\n"
|
||||
" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
|
||||
" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
|
||||
" pair.z = NEW_PAIR_MARKER;\n"
|
||||
" pair.w = NEW_PAIR_MARKER;\n"
|
||||
" \n"
|
||||
" int pairIndex = atomic_inc(out_numPairs);\n"
|
||||
" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" if(!isLeaf) //Internal node\n"
|
||||
" {\n"
|
||||
" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
|
||||
" {\n"
|
||||
" //Error\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
|
||||
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"//From rayCastKernels.cl\n"
|
||||
"typedef struct\n"
|
||||
"{\n"
|
||||
" float4 m_from;\n"
|
||||
" float4 m_to;\n"
|
||||
"} b3RayInfo;\n"
|
||||
"//From rayCastKernels.cl\n"
|
||||
"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
|
||||
"{\n"
|
||||
" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
|
||||
" return normalize(normal); //OpenCL normalize == vector4 normalize\n"
|
||||
"}\n"
|
||||
"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
|
||||
"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
|
||||
"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
|
||||
"{\n"
|
||||
" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
|
||||
" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
|
||||
" //\n"
|
||||
" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
|
||||
" //and min.x will be the far plane; otherwise, it is reversed.\n"
|
||||
" //\n"
|
||||
" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
|
||||
" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
|
||||
" \n"
|
||||
" int4 isNegative = isless( rayNormalizedDirection, (b3Vector3){0.0f, 0.0f, 0.0f, 0.0f} ); //isless(x,y) returns (x < y)\n"
|
||||
" \n"
|
||||
" //When using vector types, the select() function checks the most signficant bit, \n"
|
||||
" //but isless() sets the least significant bit.\n"
|
||||
" isNegative <<= 31;\n"
|
||||
" //select(b, a, condition) == condition ? a : b\n"
|
||||
" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
|
||||
" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
|
||||
" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
|
||||
" \n"
|
||||
" b3Scalar t_min_final = 0.0f;\n"
|
||||
" b3Scalar t_max_final = rayLength;\n"
|
||||
" \n"
|
||||
" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
|
||||
" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
|
||||
" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
|
||||
" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
|
||||
" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
|
||||
" \n"
|
||||
" return (t_min_final <= t_max_final);\n"
|
||||
"}\n"
|
||||
"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
|
||||
" __global int* rootNodeIndex, \n"
|
||||
" __global int2* internalNodeChildIndices, \n"
|
||||
" __global b3AabbCL* internalNodeAabbs,\n"
|
||||
" __global int2* internalNodeLeafIndexRanges,\n"
|
||||
" __global SortDataCL* mortonCodesAndAabbIndices,\n"
|
||||
" \n"
|
||||
" __global b3RayInfo* rays,\n"
|
||||
" \n"
|
||||
" __global int* out_numRayRigidPairs, \n"
|
||||
" __global int2* out_rayRigidPairs,\n"
|
||||
" int maxRayRigidPairs, int numRays)\n"
|
||||
"{\n"
|
||||
" int rayIndex = get_global_id(0);\n"
|
||||
" if(rayIndex >= numRays) return;\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
|
||||
" b3Vector3 rayTo = rays[rayIndex].m_to;\n"
|
||||
" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
|
||||
" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
|
||||
" \n"
|
||||
" int stackSize = 1;\n"
|
||||
" stack[0] = *rootNodeIndex;\n"
|
||||
" \n"
|
||||
" while(stackSize)\n"
|
||||
" {\n"
|
||||
" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
|
||||
" --stackSize;\n"
|
||||
" \n"
|
||||
" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
|
||||
" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
|
||||
" \n"
|
||||
" //bvhRigidIndex is not used if internal node\n"
|
||||
" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
|
||||
" \n"
|
||||
" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
|
||||
" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n"
|
||||
" {\n"
|
||||
" if(isLeaf)\n"
|
||||
" {\n"
|
||||
" int2 rayRigidPair;\n"
|
||||
" rayRigidPair.x = rayIndex;\n"
|
||||
" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
|
||||
" \n"
|
||||
" int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
|
||||
" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" if(!isLeaf) //Internal node\n"
|
||||
" {\n"
|
||||
" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
|
||||
" {\n"
|
||||
" //Error\n"
|
||||
" }\n"
|
||||
" else\n"
|
||||
" {\n"
|
||||
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
|
||||
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
|
||||
" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
|
||||
" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
|
||||
"{\n"
|
||||
" int smallAabbIndex = get_global_id(0);\n"
|
||||
" if(smallAabbIndex >= numSmallAabbRigids) return;\n"
|
||||
" \n"
|
||||
" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
|
||||
" for(int i = 0; i < numLargeAabbRigids; ++i)\n"
|
||||
" {\n"
|
||||
" b3AabbCL largeAabb = largeAabbs[i];\n"
|
||||
" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
|
||||
" {\n"
|
||||
" int4 pair;\n"
|
||||
" pair.x = largeAabb.m_minIndices[3];\n"
|
||||
" pair.y = smallAabb.m_minIndices[3];\n"
|
||||
" pair.z = NEW_PAIR_MARKER;\n"
|
||||
" pair.w = NEW_PAIR_MARKER;\n"
|
||||
" \n"
|
||||
" int pairIndex = atomic_inc(out_numPairs);\n"
|
||||
" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
|
||||
" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n"
|
||||
" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
|
||||
"{\n"
|
||||
" int rayIndex = get_global_id(0);\n"
|
||||
" if(rayIndex >= numRays) return;\n"
|
||||
" \n"
|
||||
" b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
|
||||
" b3Vector3 rayTo = rays[rayIndex].m_to;\n"
|
||||
" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
|
||||
" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
|
||||
" \n"
|
||||
" for(int i = 0; i < numLargeAabbRigids; ++i)\n"
|
||||
" {\n"
|
||||
" b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
|
||||
" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
|
||||
" {\n"
|
||||
" int2 rayRigidPair;\n"
|
||||
" rayRigidPair.x = rayIndex;\n"
|
||||
" rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
|
||||
" \n"
|
||||
" int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
|
||||
" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
|
||||
"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
|
||||
"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
|
||||
"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
|
||||
"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
|
||||
"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
|
||||
"#define b3Int64 long\n"
|
||||
"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
|
||||
"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
|
||||
"{\n"
|
||||
" //This function only needs to return (i & j) in order for the algorithm to work,\n"
|
||||
" //but it may help with debugging to mask out the lower bits.\n"
|
||||
" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
|
||||
" b3Int64 sharedBits = i & j;\n"
|
||||
" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n"
|
||||
" \n"
|
||||
" return sharedBits & bitmask;\n"
|
||||
"}\n"
|
||||
"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
|
||||
"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
|
||||
"{\n"
|
||||
" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
|
||||
"}\n"
|
||||
"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
|
||||
" __global b3Int64* out_commonPrefixes,\n"
|
||||
" __global int* out_commonPrefixLengths,\n"
|
||||
" int numInternalNodes)\n"
|
||||
"{\n"
|
||||
" int internalNodeIndex = get_global_id(0);\n"
|
||||
" if (internalNodeIndex >= numInternalNodes) return;\n"
|
||||
" \n"
|
||||
" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
|
||||
" //and the number of internal nodes is always numLeafNodes - 1\n"
|
||||
" int leftLeafIndex = internalNodeIndex;\n"
|
||||
" int rightLeafIndex = internalNodeIndex + 1;\n"
|
||||
" \n"
|
||||
" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
|
||||
" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
|
||||
" \n"
|
||||
" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
|
||||
" //Append the index of each leaf node to each morton code so that there are no duplicates.\n"
|
||||
" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
|
||||
" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
|
||||
" //\n"
|
||||
" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
|
||||
" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
|
||||
" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
|
||||
" \n"
|
||||
" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
|
||||
" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
|
||||
"}\n"
|
||||
"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
|
||||
" __global int2* out_childNodes, int numLeafNodes)\n"
|
||||
"{\n"
|
||||
" int leafNodeIndex = get_global_id(0);\n"
|
||||
" if (leafNodeIndex >= numLeafNodes) return;\n"
|
||||
" \n"
|
||||
" int numInternalNodes = numLeafNodes - 1;\n"
|
||||
" \n"
|
||||
" int leftSplitIndex = leafNodeIndex - 1;\n"
|
||||
" int rightSplitIndex = leafNodeIndex;\n"
|
||||
" \n"
|
||||
" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
|
||||
" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
|
||||
" \n"
|
||||
" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
|
||||
" //Leaf nodes are considered as having the highest common prefix\n"
|
||||
" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
|
||||
" \n"
|
||||
" //Handle cases for the edge nodes; the first and last node\n"
|
||||
" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
|
||||
" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
|
||||
" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
|
||||
" \n"
|
||||
" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
|
||||
" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
|
||||
" \n"
|
||||
" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n"
|
||||
" \n"
|
||||
" //out_childNodesAsInt[0] == int2.x == left child\n"
|
||||
" //out_childNodesAsInt[1] == int2.y == right child\n"
|
||||
" int isLeaf = 1;\n"
|
||||
" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
|
||||
" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
|
||||
"}\n"
|
||||
"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
|
||||
" __global int2* out_childNodes,\n"
|
||||
" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
|
||||
" int numInternalNodes)\n"
|
||||
"{\n"
|
||||
" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
|
||||
" if(internalNodeIndex >= numInternalNodes) return;\n"
|
||||
" \n"
|
||||
" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
|
||||
" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
|
||||
" \n"
|
||||
"//#define USE_LINEAR_SEARCH\n"
|
||||
"#ifdef USE_LINEAR_SEARCH\n"
|
||||
" int leftIndex = -1;\n"
|
||||
" int rightIndex = -1;\n"
|
||||
" \n"
|
||||
" //Find nearest element to left with a lower common prefix\n"
|
||||
" for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
|
||||
" {\n"
|
||||
" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
|
||||
" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
|
||||
" {\n"
|
||||
" leftIndex = i;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" //Find nearest element to right with a lower common prefix\n"
|
||||
" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
|
||||
" {\n"
|
||||
" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
|
||||
" if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
|
||||
" {\n"
|
||||
" rightIndex = i;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
"#else //Use binary search\n"
|
||||
" //Find nearest element to left with a lower common prefix\n"
|
||||
" int leftIndex = -1;\n"
|
||||
" {\n"
|
||||
" int lower = 0;\n"
|
||||
" int upper = internalNodeIndex - 1;\n"
|
||||
" \n"
|
||||
" while(lower <= upper)\n"
|
||||
" {\n"
|
||||
" int mid = (lower + upper) / 2;\n"
|
||||
" b3Int64 midPrefix = commonPrefixes[mid];\n"
|
||||
" int midPrefixLength = commonPrefixLengths[mid];\n"
|
||||
" \n"
|
||||
" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
|
||||
" if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
|
||||
" {\n"
|
||||
" int right = mid + 1;\n"
|
||||
" if(right < internalNodeIndex)\n"
|
||||
" {\n"
|
||||
" b3Int64 rightPrefix = commonPrefixes[right];\n"
|
||||
" int rightPrefixLength = commonPrefixLengths[right];\n"
|
||||
" \n"
|
||||
" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
|
||||
" if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
|
||||
" {\n"
|
||||
" lower = right;\n"
|
||||
" leftIndex = right;\n"
|
||||
" }\n"
|
||||
" else \n"
|
||||
" {\n"
|
||||
" leftIndex = mid;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else \n"
|
||||
" {\n"
|
||||
" leftIndex = mid;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else upper = mid - 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" //Find nearest element to right with a lower common prefix\n"
|
||||
" int rightIndex = -1;\n"
|
||||
" {\n"
|
||||
" int lower = internalNodeIndex + 1;\n"
|
||||
" int upper = numInternalNodes - 1;\n"
|
||||
" \n"
|
||||
" while(lower <= upper)\n"
|
||||
" {\n"
|
||||
" int mid = (lower + upper) / 2;\n"
|
||||
" b3Int64 midPrefix = commonPrefixes[mid];\n"
|
||||
" int midPrefixLength = commonPrefixLengths[mid];\n"
|
||||
" \n"
|
||||
" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
|
||||
" if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
|
||||
" {\n"
|
||||
" int left = mid - 1;\n"
|
||||
" if(left > internalNodeIndex)\n"
|
||||
" {\n"
|
||||
" b3Int64 leftPrefix = commonPrefixes[left];\n"
|
||||
" int leftPrefixLength = commonPrefixLengths[left];\n"
|
||||
" \n"
|
||||
" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
|
||||
" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
|
||||
" {\n"
|
||||
" upper = left;\n"
|
||||
" rightIndex = left;\n"
|
||||
" }\n"
|
||||
" else \n"
|
||||
" {\n"
|
||||
" rightIndex = mid;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else \n"
|
||||
" {\n"
|
||||
" rightIndex = mid;\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" else lower = mid + 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"#endif\n"
|
||||
" \n"
|
||||
" //Select parent\n"
|
||||
" {\n"
|
||||
" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
|
||||
" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
|
||||
" \n"
|
||||
" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
|
||||
" \n"
|
||||
" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
|
||||
" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
|
||||
" \n"
|
||||
" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
|
||||
" \n"
|
||||
" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
|
||||
" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
|
||||
" \n"
|
||||
" int isLeaf = 0;\n"
|
||||
" if(!isRootNode)\n"
|
||||
" {\n"
|
||||
" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n"
|
||||
" \n"
|
||||
" //out_childNodesAsInt[0] == int2.x == left child\n"
|
||||
" //out_childNodesAsInt[1] == int2.y == right child\n"
|
||||
" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
|
||||
" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
|
||||
" }\n"
|
||||
" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
|
||||
" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
|
||||
"{\n"
|
||||
" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
|
||||
" int internalNodeIndex = get_global_id(0);\n"
|
||||
" if(internalNodeIndex >= numInternalNodes) return;\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" int distanceFromRoot = 0;\n"
|
||||
" {\n"
|
||||
" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
|
||||
" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
|
||||
" {\n"
|
||||
" parentIndex = internalNodeParentNodes[parentIndex];\n"
|
||||
" ++distanceFromRoot;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" __local int localMaxDistanceFromRoot;\n"
|
||||
" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
|
||||
"}\n"
|
||||
"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
|
||||
" __global int2* childNodes,\n"
|
||||
" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
|
||||
" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
|
||||
"{\n"
|
||||
" int internalNodeIndex = get_global_id(0);\n"
|
||||
" if(internalNodeIndex >= numInternalNodes) return;\n"
|
||||
" \n"
|
||||
" int distance = distanceFromRoot[internalNodeIndex];\n"
|
||||
" \n"
|
||||
" if(distance == processedDistance)\n"
|
||||
" {\n"
|
||||
" int leftChildIndex = childNodes[internalNodeIndex].x;\n"
|
||||
" int rightChildIndex = childNodes[internalNodeIndex].y;\n"
|
||||
" \n"
|
||||
" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
|
||||
" int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
|
||||
" \n"
|
||||
" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
|
||||
" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
|
||||
" \n"
|
||||
" //leftRigidIndex/rightRigidIndex is not used if internal node\n"
|
||||
" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
|
||||
" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
|
||||
" \n"
|
||||
" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
|
||||
" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
|
||||
" \n"
|
||||
" b3AabbCL mergedAabb;\n"
|
||||
" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
|
||||
" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
|
||||
" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
|
||||
"{\n"
|
||||
" int internalNodeIndex = get_global_id(0);\n"
|
||||
" if(internalNodeIndex >= numInternalNodes) return;\n"
|
||||
" \n"
|
||||
" int numLeafNodes = numInternalNodes + 1;\n"
|
||||
" \n"
|
||||
" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
|
||||
" \n"
|
||||
" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n"
|
||||
" \n"
|
||||
" //Find lowest leaf index covered by this internal node\n"
|
||||
" {\n"
|
||||
" int lowestIndex = childNodes.x; //childNodes.x == Left child\n"
|
||||
" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
|
||||
" leafIndexRange.x = lowestIndex;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" //Find highest leaf index covered by this internal node\n"
|
||||
" {\n"
|
||||
" int highestIndex = childNodes.y; //childNodes.y == Right child\n"
|
||||
" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
|
||||
" leafIndexRange.y = highestIndex;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
|
||||
"}\n"
|
||||
;
|
||||
@@ -8,6 +8,11 @@
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
|
||||
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h"
|
||||
|
||||
#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
|
||||
|
||||
|
||||
@@ -20,7 +25,24 @@ struct b3GpuRaycastInternalData
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_q;
|
||||
cl_kernel m_raytraceKernel;
|
||||
cl_kernel m_raytraceKernel;
|
||||
cl_kernel m_raytracePairsKernel;
|
||||
cl_kernel m_findRayRigidPairIndexRanges;
|
||||
|
||||
b3GpuParallelLinearBvh* m_plbvh;
|
||||
b3RadixSort32CL* m_radixSorter;
|
||||
b3FillCL* m_fill;
|
||||
|
||||
//1 element per ray
|
||||
b3OpenCLArray<b3RayInfo>* m_gpuRays;
|
||||
b3OpenCLArray<b3RayHit>* m_gpuHitResults;
|
||||
b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
|
||||
b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
|
||||
|
||||
//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
|
||||
b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
|
||||
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
|
||||
|
||||
int m_test;
|
||||
};
|
||||
|
||||
@@ -31,7 +53,19 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
|
||||
m_data->m_device = device;
|
||||
m_data->m_q = q;
|
||||
m_data->m_raytraceKernel = 0;
|
||||
m_data->m_raytracePairsKernel = 0;
|
||||
m_data->m_findRayRigidPairIndexRanges = 0;
|
||||
|
||||
m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
|
||||
m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
|
||||
m_data->m_fill = new b3FillCL(ctx, device, q);
|
||||
|
||||
m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
|
||||
m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
|
||||
m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
|
||||
m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q);
|
||||
m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q);
|
||||
m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
|
||||
|
||||
{
|
||||
cl_int errNum=0;
|
||||
@@ -39,6 +73,10 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
clReleaseProgram(prog);
|
||||
}
|
||||
|
||||
@@ -48,6 +86,20 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
|
||||
b3GpuRaycast::~b3GpuRaycast()
|
||||
{
|
||||
clReleaseKernel(m_data->m_raytraceKernel);
|
||||
clReleaseKernel(m_data->m_raytracePairsKernel);
|
||||
clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
|
||||
|
||||
delete m_data->m_plbvh;
|
||||
delete m_data->m_radixSorter;
|
||||
delete m_data->m_fill;
|
||||
|
||||
delete m_data->m_gpuRays;
|
||||
delete m_data->m_gpuHitResults;
|
||||
delete m_data->m_firstRayRigidPairIndexPerRay;
|
||||
delete m_data->m_numRayRigidPairsPerRay;
|
||||
delete m_data->m_gpuNumRayRigidPairs;
|
||||
delete m_data->m_gpuRayRigidPairs;
|
||||
|
||||
delete m_data;
|
||||
}
|
||||
|
||||
@@ -206,27 +258,32 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3A
|
||||
}
|
||||
///todo: add some acceleration structure (AABBs, tree etc)
|
||||
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
|
||||
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
|
||||
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
|
||||
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
|
||||
{
|
||||
|
||||
//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
|
||||
|
||||
B3_PROFILE("castRaysGPU");
|
||||
|
||||
b3OpenCLArray<b3RayInfo> gpuRays(m_data->m_context,m_data->m_q);
|
||||
b3OpenCLArray<b3RayHit> gpuHitResults(m_data->m_context,m_data->m_q);
|
||||
|
||||
|
||||
{
|
||||
B3_PROFILE("raycast copyFromHost");
|
||||
gpuRays.copyFromHost(rays);
|
||||
|
||||
|
||||
gpuHitResults.resize(hitResults.size());
|
||||
gpuHitResults.copyFromHost(hitResults);
|
||||
m_data->m_gpuRays->copyFromHost(rays);
|
||||
m_data->m_gpuHitResults->copyFromHost(hitResults);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
int numRays = hitResults.size();
|
||||
{
|
||||
m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
|
||||
m_data->m_numRayRigidPairsPerRay->resize(numRays);
|
||||
|
||||
m_data->m_gpuNumRayRigidPairs->resize(1);
|
||||
m_data->m_gpuRayRigidPairs->resize(numRays * 16);
|
||||
}
|
||||
|
||||
//run kernel
|
||||
const bool USE_BRUTE_FORCE_RAYCAST = false;
|
||||
if(USE_BRUTE_FORCE_RAYCAST)
|
||||
{
|
||||
B3_PROFILE("raycast launch1D");
|
||||
|
||||
@@ -234,8 +291,8 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align
|
||||
int numRays = rays.size();
|
||||
launcher.setConst(numRays);
|
||||
|
||||
launcher.setBuffer(gpuRays.getBufferCL());
|
||||
launcher.setBuffer(gpuHitResults.getBufferCL());
|
||||
launcher.setBuffer(m_data->m_gpuRays->getBufferCL());
|
||||
launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL());
|
||||
|
||||
launcher.setConst(numBodies);
|
||||
launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL());
|
||||
@@ -246,11 +303,89 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align
|
||||
launcher.launch1D(numRays);
|
||||
clFinish(m_data->m_q);
|
||||
}
|
||||
else
|
||||
{
|
||||
m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() );
|
||||
|
||||
m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
|
||||
|
||||
int numRayRigidPairs = -1;
|
||||
m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
|
||||
if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() )
|
||||
{
|
||||
numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
|
||||
m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
|
||||
}
|
||||
|
||||
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
|
||||
|
||||
//Sort ray-rigid pairs by ray index
|
||||
{
|
||||
B3_PROFILE("sort ray-rigid pairs");
|
||||
m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) );
|
||||
}
|
||||
|
||||
//detect start,count of each ray pair
|
||||
{
|
||||
B3_PROFILE("detect ray-rigid pair index ranges");
|
||||
|
||||
{
|
||||
B3_PROFILE("reset ray-rigid pair index ranges");
|
||||
|
||||
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
|
||||
m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
|
||||
clFinish(m_data->m_q);
|
||||
}
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numRayRigidPairs);
|
||||
|
||||
launcher.launch1D(numRayRigidPairs);
|
||||
clFinish(m_data->m_q);
|
||||
}
|
||||
|
||||
{
|
||||
B3_PROFILE("ray-rigid intersection");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
|
||||
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ),
|
||||
b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ),
|
||||
b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ),
|
||||
b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() )
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst(numRays);
|
||||
|
||||
launcher.launch1D(numRays);
|
||||
clFinish(m_data->m_q);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//copy results
|
||||
{
|
||||
B3_PROFILE("raycast copyToHost");
|
||||
gpuHitResults.copyToHost(hitResults);
|
||||
m_data->m_gpuHitResults->copyToHost(hitResults);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -23,8 +23,7 @@ public:
|
||||
|
||||
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
|
||||
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
|
||||
const struct b3GpuNarrowPhaseInternalData* narrowphaseData
|
||||
);
|
||||
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -337,3 +337,103 @@ __kernel void rayCastKernel(
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs,
|
||||
__global int* out_firstRayRigidPairIndexPerRay,
|
||||
__global int* out_numRayRigidPairsPerRay,
|
||||
int numRayRigidPairs)
|
||||
{
|
||||
int rayRigidPairIndex = get_global_id(0);
|
||||
if (rayRigidPairIndex >= numRayRigidPairs) return;
|
||||
|
||||
int rayIndex = rayRigidPairs[rayRigidPairIndex].x;
|
||||
|
||||
atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);
|
||||
atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);
|
||||
}
|
||||
|
||||
__kernel void rayCastPairsKernel(const __global b3RayInfo* rays,
|
||||
__global b3RayHit* hitResults,
|
||||
__global int* firstRayRigidPairIndexPerRay,
|
||||
__global int* numRayRigidPairsPerRay,
|
||||
|
||||
__global Body* bodies,
|
||||
__global Collidable* collidables,
|
||||
__global const b3GpuFace* faces,
|
||||
__global const ConvexPolyhedronCL* convexShapes,
|
||||
|
||||
__global int2* rayRigidPairs,
|
||||
int numRays)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i >= numRays) return;
|
||||
|
||||
float4 rayFrom = rays[i].m_from;
|
||||
float4 rayTo = rays[i].m_to;
|
||||
|
||||
hitResults[i].m_hitFraction = 1.f;
|
||||
|
||||
float hitFraction = 1.f;
|
||||
float4 hitPoint;
|
||||
float4 hitNormal;
|
||||
int hitBodyIndex = -1;
|
||||
|
||||
//
|
||||
for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)
|
||||
{
|
||||
int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];
|
||||
int b = rayRigidPairs[rayRigidPairIndex].y;
|
||||
|
||||
if (hitResults[i].m_hitResult2 == b) continue;
|
||||
|
||||
Body body = bodies[b];
|
||||
Collidable rigidCollidable = collidables[body.m_collidableIdx];
|
||||
|
||||
float4 pos = body.m_pos;
|
||||
float4 orn = body.m_quat;
|
||||
|
||||
if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)
|
||||
{
|
||||
float4 invPos = (float4)(0,0,0,0);
|
||||
float4 invOrn = (float4)(0,0,0,0);
|
||||
float4 rayFromLocal = (float4)(0,0,0,0);
|
||||
float4 rayToLocal = (float4)(0,0,0,0);
|
||||
invOrn = qtInvert(orn);
|
||||
invPos = qtRotate(invOrn, -pos);
|
||||
rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
|
||||
rayToLocal = qtRotate( invOrn, rayTo) + invPos;
|
||||
rayFromLocal.w = 0.f;
|
||||
rayToLocal.w = 0.f;
|
||||
int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;
|
||||
int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;
|
||||
|
||||
if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))
|
||||
{
|
||||
hitBodyIndex = b;
|
||||
hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
|
||||
}
|
||||
}
|
||||
|
||||
if (rigidCollidable.m_shapeType == SHAPE_SPHERE)
|
||||
{
|
||||
float radius = rigidCollidable.m_radius;
|
||||
|
||||
if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
|
||||
{
|
||||
hitBodyIndex = b;
|
||||
hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
|
||||
hitNormal = (float4) (hitPoint - bodies[b].m_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (hitBodyIndex >= 0)
|
||||
{
|
||||
hitResults[i].m_hitFraction = hitFraction;
|
||||
hitResults[i].m_hitPoint = hitPoint;
|
||||
hitResults[i].m_hitNormal = normalize(hitNormal);
|
||||
hitResults[i].m_hitResult0 = hitBodyIndex;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -281,4 +281,101 @@ static const char* rayCastKernelCL= \
|
||||
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
|
||||
" __global int* out_firstRayRigidPairIndexPerRay,\n"
|
||||
" __global int* out_numRayRigidPairsPerRay,\n"
|
||||
" int numRayRigidPairs)\n"
|
||||
"{\n"
|
||||
" int rayRigidPairIndex = get_global_id(0);\n"
|
||||
" if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
|
||||
" \n"
|
||||
" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
|
||||
" \n"
|
||||
" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
|
||||
" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
|
||||
"}\n"
|
||||
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
|
||||
" __global b3RayHit* hitResults, \n"
|
||||
" __global int* firstRayRigidPairIndexPerRay,\n"
|
||||
" __global int* numRayRigidPairsPerRay,\n"
|
||||
" \n"
|
||||
" __global Body* bodies,\n"
|
||||
" __global Collidable* collidables,\n"
|
||||
" __global const b3GpuFace* faces,\n"
|
||||
" __global const ConvexPolyhedronCL* convexShapes,\n"
|
||||
" \n"
|
||||
" __global int2* rayRigidPairs,\n"
|
||||
" int numRays)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i >= numRays) return;\n"
|
||||
" \n"
|
||||
" float4 rayFrom = rays[i].m_from;\n"
|
||||
" float4 rayTo = rays[i].m_to;\n"
|
||||
" \n"
|
||||
" hitResults[i].m_hitFraction = 1.f;\n"
|
||||
" \n"
|
||||
" float hitFraction = 1.f;\n"
|
||||
" float4 hitPoint;\n"
|
||||
" float4 hitNormal;\n"
|
||||
" int hitBodyIndex = -1;\n"
|
||||
" \n"
|
||||
" //\n"
|
||||
" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
|
||||
" {\n"
|
||||
" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
|
||||
" int b = rayRigidPairs[rayRigidPairIndex].y;\n"
|
||||
" \n"
|
||||
" if (hitResults[i].m_hitResult2 == b) continue;\n"
|
||||
" \n"
|
||||
" Body body = bodies[b];\n"
|
||||
" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
|
||||
" \n"
|
||||
" float4 pos = body.m_pos;\n"
|
||||
" float4 orn = body.m_quat;\n"
|
||||
" \n"
|
||||
" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
|
||||
" {\n"
|
||||
" float4 invPos = (float4)(0,0,0,0);\n"
|
||||
" float4 invOrn = (float4)(0,0,0,0);\n"
|
||||
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
|
||||
" float4 rayToLocal = (float4)(0,0,0,0);\n"
|
||||
" invOrn = qtInvert(orn);\n"
|
||||
" invPos = qtRotate(invOrn, -pos);\n"
|
||||
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
|
||||
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
|
||||
" rayFromLocal.w = 0.f;\n"
|
||||
" rayToLocal.w = 0.f;\n"
|
||||
" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
|
||||
" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
|
||||
" \n"
|
||||
" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
|
||||
" {\n"
|
||||
" hitBodyIndex = b;\n"
|
||||
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
|
||||
" {\n"
|
||||
" float radius = rigidCollidable.m_radius;\n"
|
||||
" \n"
|
||||
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
|
||||
" {\n"
|
||||
" hitBodyIndex = b;\n"
|
||||
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
|
||||
" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" if (hitBodyIndex >= 0)\n"
|
||||
" {\n"
|
||||
" hitResults[i].m_hitFraction = hitFraction;\n"
|
||||
" hitResults[i].m_hitPoint = hitPoint;\n"
|
||||
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
|
||||
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
;
|
||||
|
||||
@@ -703,6 +703,6 @@ void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& ray
|
||||
{
|
||||
this->m_data->m_raycaster->castRays(rays,hitResults,
|
||||
getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(),
|
||||
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getInternalData()
|
||||
);
|
||||
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
|
||||
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user