Merge pull request #160 from rtrius/plbvh_raycast

Plbvh raycast
This commit is contained in:
erwincoumans
2014-04-09 11:14:34 -07:00
21 changed files with 2870 additions and 29 deletions

View File

@@ -9,6 +9,7 @@
#include "OpenGLWindow/b3gWindowInterface.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuGridBroadphase.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvhBroadphase.h"
#include "../GpuDemoInternalData.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
@@ -108,6 +109,7 @@ static int curSelectedBroadphase = 0;
static BroadphaseEntry allBroadphases[]=
{
{"Gpu Grid",b3GpuGridBroadphase::CreateFunc},
{"Parallel Linear BVH",b3GpuParallelLinearBvhBroadphase::CreateFunc},
{"CPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceCpu},
{"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu},
{"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal},
@@ -119,6 +121,7 @@ static BroadphaseEntry allBroadphases[]=
struct PairBenchInternalData
{
b3GpuBroadphaseInterface* m_broadphaseGPU;
b3GpuBroadphaseInterface* m_validationBroadphase;
cl_kernel m_moveObjectsKernel;
cl_kernel m_sineWaveKernel;
@@ -154,6 +157,8 @@ PairBench::PairBench()
m_window(0)
{
m_data = new PairBenchInternalData;
m_data->m_validationBroadphase = 0;
}
PairBench::~PairBench()
{
@@ -505,7 +510,16 @@ void PairBench::initPhysics(const ConstructionInfo& ci)
m_data->m_sineWaveKernel = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"sineWaveKernel",&errNum,pairBenchProg);
m_data->m_colorPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"colorPairsKernel2",&errNum,pairBenchProg);
m_data->m_updateAabbSimple = b3OpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,pairsKernelsCL,"updateAabbSimple",&errNum,pairBenchProg);
//The method used to validate the overlapping pairs requires that the
//reference broadphase does not maintain internal state aside from AABB data.
//That is, overwriting the AABB state in the broadphase using
// b3GpuBroadphaseInterface::getAllAabbsGPU(),
// b3GpuBroadphaseInterface::getSmallAabbIndicesGPU(), and
// b3GpuBroadphaseInterface::getLargeAabbIndicesGPU()
//and then calling b3GpuBroadphaseInterface::calculateOverlappingPairs() should
//always produce the same result regardless of the current state of the broadphase.
m_data->m_validationBroadphase = b3GpuParallelLinearBvhBroadphase::CreateFunc(m_clData->m_clContext,m_clData->m_clDevice,m_clData->m_clQueue);
}
if (ci.m_window)
@@ -741,6 +755,12 @@ void PairBench::deleteBroadphase()
void PairBench::exitPhysics()
{
if(m_data->m_validationBroadphase)
{
delete m_data->m_validationBroadphase;
m_data->m_validationBroadphase = 0;
}
#ifdef B3_USE_MIDI
if (m_data->m_midiIn)
{
@@ -768,6 +788,17 @@ void PairBench::renderScene()
m_instancingRenderer->renderScene();
}
struct OverlappingPairSortPredicate
{
inline bool operator() (const b3Int4& a, const b3Int4& b) const
{
if(a.x != b.x) return (a.x < b.x);
if(a.y != b.y) return (a.y < b.y);
if(a.z != b.z) return (a.z < b.z);
return (a.w < b.w);
}
};
void PairBench::clientMoveAndDisplay()
{
//color all objects blue
@@ -901,7 +932,10 @@ void PairBench::clientMoveAndDisplay()
}
}
int prealloc = 3*1024*1024;
int maxOverlap = b3Min(prealloc,16*numObjects);
unsigned long dt = 0;
if (numObjects)
{
@@ -910,16 +944,104 @@ void PairBench::clientMoveAndDisplay()
B3_PROFILE("calculateOverlappingPairs");
int sz = sizeof(b3Int4)*64*numObjects;
int prealloc = 3*1024*1024;
int maxOverlap = b3Min(prealloc,16*numObjects);
m_data->m_broadphaseGPU->calculateOverlappingPairs(maxOverlap);
int numPairs = m_data->m_broadphaseGPU->getNumOverlap();
//printf("numPairs = %d\n", numPairs);
dt = cl.getTimeMicroseconds()-dt;
}
const bool VALIDATE_BROADPHASE = false; //Check that overlapping pairs of 2 broadphases are the same
if(numObjects && VALIDATE_BROADPHASE)
{
B3_PROFILE("validate broadphases");
{
B3_PROFILE("calculateOverlappingPairs m_validationBroadphase");
//m_data->m_validationBroadphase->getAllAabbsCPU() = m_data->m_broadphaseGPU->getAllAabbsCPU();
m_data->m_validationBroadphase->getAllAabbsGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getAllAabbsGPU() );
m_data->m_validationBroadphase->getSmallAabbIndicesGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getSmallAabbIndicesGPU() );
m_data->m_validationBroadphase->getLargeAabbIndicesGPU().copyFromOpenCLArray( m_data->m_broadphaseGPU->getLargeAabbIndicesGPU() );
m_data->m_validationBroadphase->calculateOverlappingPairs(maxOverlap);
}
static b3AlignedObjectArray<b3Int4> overlappingPairs;
static b3AlignedObjectArray<b3Int4> overlappingPairsReference;
m_data->m_broadphaseGPU->getOverlappingPairsGPU().copyToHost(overlappingPairs);
m_data->m_validationBroadphase->getOverlappingPairsGPU().copyToHost(overlappingPairsReference);
//Reorder pairs so that (pair.x < pair.y) is always true
{
B3_PROFILE("reorder pairs");
for(int i = 0; i < overlappingPairs.size(); ++i)
{
b3Int4 pair = overlappingPairs[i];
if(pair.x > pair.y)
{
b3Swap(pair.x, pair.y);
b3Swap(pair.z, pair.w);
overlappingPairs[i] = pair;
}
}
for(int i = 0; i < overlappingPairsReference.size(); ++i)
{
b3Int4 pair = overlappingPairsReference[i];
if(pair.x > pair.y)
{
b3Swap(pair.x, pair.y);
b3Swap(pair.z, pair.w);
overlappingPairsReference[i] = pair;
}
}
}
//
{
B3_PROFILE("Sort overlapping pairs from most to least significant bit");
overlappingPairs.quickSort( OverlappingPairSortPredicate() );
overlappingPairsReference.quickSort( OverlappingPairSortPredicate() );
}
//Compare
{
B3_PROFILE("compare pairs");
int numPairs = overlappingPairs.size();
int numPairsReference = overlappingPairsReference.size();
bool success = true;
if(numPairs == numPairsReference)
{
for(int i = 0; i < numPairsReference; ++i)
{
const b3Int4& pairA = overlappingPairs[i];
const b3Int4& pairB = overlappingPairsReference[i];
if( pairA.x != pairB.x
|| pairA.y != pairB.y
|| pairA.z != pairB.z
|| pairA.w != pairB.w )
{
b3Error("Error: one or more overlappingPairs differs from reference.\n");
success = false;
break;
}
}
}
else
{
b3Error("Error: numPairs %d != numPairsReference %d \n", numPairs, numPairsReference);
success = false;
}
printf("Broadphase validation: %d \n", success);
}
}
if (m_data->m_gui)
{

View File

@@ -187,6 +187,10 @@ void GpuRaytraceScene::renderScene()
void GpuRaytraceScene::renderScene2()
{
//If using the BVH to accelerate raycasting, the AABBs need to be updated or else they will
//not match the actual rigid body positions after integration. The result is that rigid bodies
//are not drawn or appear clipped, especially if they are moving quickly.
m_data->m_rigidBodyPipeline->setupGpuAabbsFull();
// GpuBoxPlaneScene::renderScene();
// return;
@@ -308,7 +312,7 @@ void GpuRaytraceScene::renderScene2()
{
B3_PROFILE("cast primary rays");
//m_raycaster->castRaysHost(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(),m_data->m_np->getInternalData());
m_raycaster->castRays(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData());
m_raycaster->castRays(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData(), m_data->m_bp);
}
@@ -350,7 +354,7 @@ void GpuRaytraceScene::renderScene2()
{
B3_PROFILE("cast shadow rays");
//m_raycaster->castRaysHost(primaryRays, hits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu());
m_raycaster->castRays(shadowRays, shadowHits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData());
m_raycaster->castRays(shadowRays, shadowHits, this->m_data->m_np->getNumRigidBodies(), m_data->m_np->getBodiesCpu(), m_data->m_np->getNumCollidablesGpu(), m_data->m_np->getCollidablesCpu(), m_data->m_np->getInternalData(), m_data->m_bp);
}
{

View File

@@ -13,6 +13,7 @@ premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/Broadphas
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
premake4 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify

View File

@@ -14,6 +14,7 @@ rem @echo off
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h" --stringname="gridBroadphaseCL" stringify
./premake4_linux64 --file=stringifyKernel.lua --kernelfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl" --headerfile="../src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h" --stringname="parallelLinearBvhCL" stringify

Binary file not shown.

View File

@@ -34,6 +34,10 @@ public:
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
};

View File

@@ -366,4 +366,18 @@ b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
}
b3OpenCLArray<b3Int4>& b3GpuGridBroadphase::getOverlappingPairsGPU()
{
return m_gpuPairs;
}
b3OpenCLArray<int>& b3GpuGridBroadphase::getSmallAabbIndicesGPU()
{
return m_smallAabbsMappingGPU;
}
b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View File

@@ -78,6 +78,10 @@ public:
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};

View File

@@ -0,0 +1,577 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "b3GpuParallelLinearBvh.h"
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
{
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_separateAabbsKernel);
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_findAllNodesMergedAabbKernel);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_findDistanceFromRootKernel);
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_findLeafIndexRangesKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_plbvhRayTraverseKernel);
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_plbvhLargeAabbAabbTestKernel);
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
clReleaseKernel(m_findLeafIndexRangesKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
//Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
//Write large AABBs into m_largeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLargeAabbs);
launcher.launch1D(numLargeAabbs);
}
//Write small AABBs into m_leafNodeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numSmallAabbs);
launcher.launch1D(numSmallAabbs);
}
clFinish(m_queue);
}
//
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
if(numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
if(numLeaves == 1)
{
b3SortData leaf;
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
return;
}
//
{
m_internalNodeAabbs.resize(numInternalNodes);
m_internalNodeLeafIndexRanges.resize(numInternalNodes);
m_internalNodeChildNodes.resize(numInternalNodes);
m_internalNodeParentNodes.resize(numInternalNodes);
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
//Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
clFinish(m_queue);
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
// m_key == morton code (value to sort by)
// m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//
constructBinaryRadixTree();
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
//
//For example, if the root has indices [0, 31], its children might partition that range into [0, 11] and [12, 31].
//The next level in the tree could then split those ranges into [0, 2], [3, 11], [12, 22], and [23, 31].
//
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
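//A concrete illustration (not part of the original comment): if the traversal query starts at leaf index 20,
//every leaf or internal node whose highest leaf index is <= 20 is skipped, so the pair between
//leaves 5 and 20 is emitted only while leaf 5 is the query node; see the highestLeafIndex check
//in plbvhCalculateOverlappingPairs() in parallelLinearBvh.cl.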
{
B3_PROFILE("m_findLeafIndexRangesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
}
void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs)
{
int maxPairs = out_overlappingPairs.size();
b3OpenCLArray<int>& numPairsGpu = m_temp;
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 1 )
{
B3_PROFILE("PLBVH small-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
{
B3_PROFILE("PLBVH large-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
if(numPairs > maxPairs)
{
b3Error("Error: ran out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 0 )
{
B3_PROFILE("PLBVH ray test small AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if(numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
if(numRayRigidPairs > maxRayRigidPairs)
b3Error("Error: ran out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
}
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
int numLeaves = m_leafNodeAabbs.size();
int numInternalNodes = numLeaves - 1;
//Each internal node is placed in between 2 leaf nodes.
//By using this arrangement and computing the common prefix between
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
{
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//For each leaf node, select its parent node by
//comparing the 2 nearest internal nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//For each internal node, perform 2 binary searches among the other internal nodes
//to its left and right to find its potential parent nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Find the number of nodes separating each internal node and the root node
//so that the AABBs can be set using the next kernel.
//Also determine the maximum number of nodes separating an internal node and the root node.
{
B3_PROFILE("m_findDistanceFromRootKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
//the tree towards the root to set the AABBs of each internal node; each internal node
//checks its children and merges their AABBs
{
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
int maxDistanceFromRoot = -1;
{
B3_PROFILE("copy maxDistanceFromRoot to CPU");
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
clFinish(m_queue);
}
for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(maxDistanceFromRoot);
launcher.setConst(distanceFromRoot);
launcher.setConst(numInternalNodes);
//It may seem inefficient to launch a thread for each internal node when a
//much smaller number of nodes is actually processed, but this is still
//faster than determining the exact nodes that are ready to merge their child AABBs.
launcher.launch1D(numInternalNodes);
}
clFinish(m_queue);
}
}

View File

@@ -0,0 +1,125 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_H
#define B3_GPU_PARALLEL_LINEAR_BVH_H
//#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h"
#include "Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvhKernels.h"
#define b3Int64 cl_long
///@brief GPU Parallel Linearized Bounding Volume Hierarchy (LBVH) that is reconstructed every frame
///@remarks
///See presentation in docs/b3GpuParallelLinearBvh.pdf for algorithm details.
///@par
///Related papers: \n
///"Fast BVH Construction on GPUs" [Lauterbach et al. 2009] \n
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
///Instead of searching for the child nodes of each internal node, we search for the parent node of each node.
///Additionally, a non-atomic traversal that starts from the leaf nodes and moves towards the root node is used to set the AABBs.
class b3GpuParallelLinearBvh
{
cl_command_queue m_queue;
cl_program m_parallelLinearBvhProgram;
cl_kernel m_separateAabbsKernel;
cl_kernel m_findAllNodesMergedAabbKernel;
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
//Binary radix tree construction kernels
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
cl_kernel m_findDistanceFromRootKernel;
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
cl_kernel m_findLeafIndexRangesKernel;
//Traversal kernels
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
cl_kernel m_plbvhRayTraverseKernel;
cl_kernel m_plbvhLargeAabbAabbTestKernel;
cl_kernel m_plbvhLargeAabbRayTestKernel;
b3RadixSort32CL m_radixSorter;
//1 element
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
//1 element per internal node; for binary radix tree construction
b3OpenCLArray<b3Int64> m_commonPrefixes;
b3OpenCLArray<int> m_commonPrefixLengths;
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
//1 element per leaf node (leaf nodes only include small AABBs)
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
//1 element per large AABB, which is not stored in the BVH
b3OpenCLArray<b3SapAabb> m_largeAabbs;
public:
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvh();
///Must be called before any other function
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
///@param out_numRayRigidPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
private:
void constructBinaryRadixTree();
};
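//A minimal usage sketch of the class above (illustrative only; 'aabbs', 'smallIndices', 'largeIndices',
//'rays', 'maxPairs', and 'maxRayRigidPairs' are hypothetical caller-side values, with the AABB and ray
//arrays already written to the GPU):
//b3GpuParallelLinearBvh bvh(context, device, queue);
//bvh.build(aabbs, smallIndices, largeIndices); //must be called first, typically every frame
//b3OpenCLArray<b3Int4> pairs(context, queue);
//pairs.resize(maxPairs); //size() acts as the pair capacity
//bvh.calculateOverlappingPairs(pairs); //pairs is resized to the number of detected pairs
//b3OpenCLArray<int> numRayRigidPairs(context, queue);
//numRayRigidPairs.resize(1);
//b3OpenCLArray<b3Int2> rayRigidPairs(context, queue);
//rayRigidPairs.resize(maxRayRigidPairs);
//bvh.testRaysAgainstBvhAabbs(rays, numRayRigidPairs, rayRigidPairs);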
#endif

View File

@@ -0,0 +1,80 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#include "b3GpuParallelLinearBvhBroadphase.h"
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
{
}
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_smallAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_largeAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
//Reconstruct BVH
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
//
m_overlappingPairsGpu.resize(maxPairs);
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
b3Assert(0); //CPU version not implemented
}
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}

View File

@@ -0,0 +1,66 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
#ifndef B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#define B3_GPU_PARALLEL_LINEAR_BVH_BROADPHASE_H
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "b3GpuParallelLinearBvh.h"
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
b3GpuParallelLinearBvh m_plbvh;
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
b3OpenCLArray<int> m_smallAabbsMappingGpu;
b3OpenCLArray<int> m_largeAabbsMappingGpu;
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
public:
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvhBroadphase() {}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, short int collisionFilterGroup, short int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
}
};
#endif

View File

@@ -1307,3 +1307,16 @@ cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
{
return m_overlappingPairs.getBufferCL();
}
b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU()
{
return m_overlappingPairs;
}
b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU()
{
return m_smallAabbsMappingGPU;
}
b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View File

@@ -143,6 +143,9 @@ public:
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_SAP_BROADPHASE_H

View File

@@ -0,0 +1,767 @@
/*
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Initial Author Jackson Lee, 2014
typedef float b3Scalar;
typedef float4 b3Vector3;
#define b3Max max
#define b3Min min
#define b3Sqrt sqrt
typedef struct
{
unsigned int m_key;
unsigned int m_value;
} SortDataCL;
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} b3AabbCL;
unsigned int interleaveBits(unsigned int x)
{
//........ ........ ......12 3456789A //x
//....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits
//......12 3456789A ......12 3456789A //x ^ (x << 16)
//11111111 ........ ........ 11111111 //0x FF 00 00 FF
//......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;
//......12 ........ 3456789A 3456789A //x ^ (x << 8)
//......11 ........ 1111.... ....1111 //0x 03 00 F0 0F
//......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;
//..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)
//......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3
//......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;
//....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)
//....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49
//....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;
//........ ........ ......11 11111111 //0x000003FF
x &= 0x000003FF; //Clear all bits above bit 10
x = (x ^ (x << 16)) & 0xFF0000FF;
x = (x ^ (x << 8)) & 0x0300F00F;
x = (x ^ (x << 4)) & 0x030C30C3;
x = (x ^ (x << 2)) & 0x09249249;
return x;
}
unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)
{
return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;
}
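//Worked example (illustrative): for quantized grid coordinates x = 1, y = 2, z = 3:
//interleaveBits(1) == 1 (binary 000001), interleaveBits(2) == 8 (binary 001000), interleaveBits(3) == 9 (binary 001001)
//getMortonCode(1, 2, 3) == 1 | (8 << 1) | (9 << 2) == 53 (binary 110101)
//Reading the code in groups of 3 bits from the least significant end gives (z,y,x) per bit level:
//bits[2..0] == 101 -> (z0,y0,x0) == (1,0,1); bits[5..3] == 110 -> (z1,y1,x1) == (1,1,0).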
__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)
{
int separatedAabbIndex = get_global_id(0);
if(separatedAabbIndex >= numAabbsToSeparate) return;
int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];
out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];
}
//Should be replaced with an optimized parallel reduction
__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)
{
//Each time this kernel is added to the command queue,
//the number of AABBs needing to be merged is halved
//
//Example with 159 AABBs:
// numRemainingAabbs == 159 / 2 + 159 % 2 == 80
// numMergedAabbs == 159 - 80 == 79
//So, indices [0, 78] are merged with [0 + 80, 78 + 80]
int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;
int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;
int aabbIndex = get_global_id(0);
if(aabbIndex >= numMergedAabbs) return;
int otherAabbIndex = aabbIndex + numRemainingAabbs;
b3AabbCL aabb = out_mergedAabb[aabbIndex];
b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);
mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);
out_mergedAabb[aabbIndex] = mergedAabb;
}
__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes,
__global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)
{
int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index
if(leafNodeIndex >= numAabbs) return;
b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];
b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;
b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;
b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];
b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;
b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;
//Quantize into integer coordinates
//floor() is needed to prevent the center cell, at (0,0,0), from being twice the size
b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;
int4 discretePosition;
discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );
discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );
discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );
//Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]
discretePosition = b3Max( -512, b3Min(discretePosition, 511) );
discretePosition += 512;
//Interleave bits(assign a morton code, also known as a z-curve)
unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);
//
SortDataCL mortonCodeIndexPair;
mortonCodeIndexPair.m_key = mortonCode;
mortonCodeIndexPair.m_value = leafNodeIndex;
out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;
}
#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128
//The most significant bit (0x80000000) of an int32 is used to distinguish between leaf and internal nodes.
//If it is set, then the index is for an internal node; otherwise, it is a leaf node.
//In both cases, the bit should be cleared to access the actual node index.
int isLeafNode(int index) { return (index >> 31 == 0); }
int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }
int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }
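//For example (illustrative): getIndexWithInternalNodeMarkerSet(1, 5) == 5 (leaf node 5), while
//getIndexWithInternalNodeMarkerSet(0, 5) == 0x80000005 (internal node 5);
//isLeafNode(0x80000005) == 0 and getIndexWithInternalNodeMarkerRemoved(0x80000005) == 5.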
//From sap.cl
#define NEW_PAIR_MARKER -1
bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
//From sap.cl
__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs,
__global int* rootNodeIndex,
__global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numQueryAabbs)
{
//Using get_group_id()/get_local_id() is faster than get_global_id(0) since
//mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)
int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(queryBvhNodeIndex >= numQueryAabbs) return;
int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;
b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
int stackSize = 1;
stack[0] = *rootNodeIndex;
while(stackSize)
{
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//Optimization - if the BVH is structured as a binary radix tree, then
//each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).
//This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.
{
int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;
if(highestLeafIndex <= queryBvhNodeIndex) continue;
}
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )
{
if(isLeaf)
{
int4 pair;
pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];
pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
pair.z = NEW_PAIR_MARKER;
pair.w = NEW_PAIR_MARKER;
int pairIndex = atomic_inc(out_numPairs);
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
}
if(!isLeaf) //Internal node
{
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}
//From rayCastKernels.cl
typedef struct
{
float4 m_from;
float4 m_to;
} b3RayInfo;
//From rayCastKernels.cl
b3Vector3 b3Vector3_normalize(b3Vector3 v)
{
b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};
return normalize(normal); //OpenCL normalize == vector4 normalize
}
b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }
int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)
{
//AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).
//t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.
//
//if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane
//and min.x will be the far plane; otherwise, it is reversed.
//
//In order for there to be a collision, the t_min and t_max of each pair must overlap.
//This can be tested for by selecting the highest t_min and lowest t_max and comparing them.
int4 isNegative = isless( rayNormalizedDirection, (b3Vector3){0.0f, 0.0f, 0.0f, 0.0f} ); //isless(x,y) returns (x < y)
//When using vector types, the select() function checks the most significant bit,
//but isless() sets the least significant bit.
isNegative <<= 31;
//select(b, a, condition) == condition ? a : b
//When using select() with vector types, (condition[i]) is true if its most significant bit is 1
b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;
b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;
b3Scalar t_min_final = 0.0f;
b3Scalar t_max_final = rayLength;
//Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned.
//Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])
//Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.
t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );
t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );
return (t_min_final <= t_max_final);
}
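//Worked example of the slab test above (illustrative): rayOrigin == (0,0,0), rayLength == 10,
//rayNormalizedDirection == (1,1,1)/sqrt(3) ~= (0.577, 0.577, 0.577), aabb.m_min == (1,1,1), aabb.m_max == (2,2,2).
//No direction component is negative, so t_min ~= (1.73, 1.73, 1.73) and t_max ~= (3.46, 3.46, 3.46);
//t_min_final ~= 1.73 <= t_max_final ~= 3.46, so the function returns 1 (intersection).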
__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,
__global int* rootNodeIndex,
__global int2* internalNodeChildIndices,
__global b3AabbCL* internalNodeAabbs,
__global int2* internalNodeLeafIndexRanges,
__global SortDataCL* mortonCodesAndAabbIndices,
__global b3RayInfo* rays,
__global int* out_numRayRigidPairs,
__global int2* out_rayRigidPairs,
int maxRayRigidPairs, int numRays)
{
int rayIndex = get_global_id(0);
if(rayIndex >= numRays) return;
//
b3Vector3 rayFrom = rays[rayIndex].m_from;
b3Vector3 rayTo = rays[rayIndex].m_to;
b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
//
int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];
int stackSize = 1;
stack[0] = *rootNodeIndex;
while(stackSize)
{
int internalOrLeafNodeIndex = stack[ stackSize - 1 ];
--stackSize;
int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false
int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);
//bvhRigidIndex is not used if internal node
int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;
b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];
if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )
{
if(isLeaf)
{
int2 rayRigidPair;
rayRigidPair.x = rayIndex;
rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];
int pairIndex = atomic_inc(out_numRayRigidPairs);
if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
}
if(!isLeaf) //Internal node
{
if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)
{
//Error
}
else
{
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;
stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;
}
}
}
}
}
__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs,
__global int* out_numPairs, __global int4* out_overlappingPairs,
int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)
{
int smallAabbIndex = get_global_id(0);
if(smallAabbIndex >= numSmallAabbRigids) return;
b3AabbCL smallAabb = smallAabbs[smallAabbIndex];
for(int i = 0; i < numLargeAabbRigids; ++i)
{
b3AabbCL largeAabb = largeAabbs[i];
if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )
{
int4 pair;
pair.x = largeAabb.m_minIndices[3];
pair.y = smallAabb.m_minIndices[3];
pair.z = NEW_PAIR_MARKER;
pair.w = NEW_PAIR_MARKER;
int pairIndex = atomic_inc(out_numPairs);
if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;
}
}
}
__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,
__global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,
int numLargeAabbRigids, int maxRayRigidPairs, int numRays)
{
int rayIndex = get_global_id(0);
if(rayIndex >= numRays) return;
b3Vector3 rayFrom = rays[rayIndex].m_from;
b3Vector3 rayTo = rays[rayIndex].m_to;
b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);
b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );
for(int i = 0; i < numLargeAabbRigids; ++i)
{
b3AabbCL rigidAabb = largeRigidAabbs[i];
if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )
{
int2 rayRigidPair;
rayRigidPair.x = rayIndex;
rayRigidPair.y = rigidAabb.m_minIndices[3];
int pairIndex = atomic_inc(out_numRayRigidPairs);
if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;
}
}
}
//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.
//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.
//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.
//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).
#define B3_PLBVH_INVALID_COMMON_PREFIX 128
#define B3_PLBVH_ROOT_NODE_MARKER -1
#define b3Int64 long
int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }
b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j)
{
//This function only needs to return (i & j) in order for the algorithm to work,
//but it may help with debugging to mask out the lower bits.
b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);
b3Int64 sharedBits = i & j;
b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0
return sharedBits & bitmask;
}
//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths
int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)
{
return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );
}
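A small host-side illustration of the prefix helpers (assuming GCC/Clang's __builtin_clzll; not part of this commit). Note that OpenCL's clz() is defined for 0 while the builtin is not, so the zero case is handled explicitly:
#include <cstdint>
#include <cstdio>
static int commonPrefixLength64(uint64_t i, uint64_t j)
{
	const uint64_t diff = i ^ j;
	return diff ? __builtin_clzll(diff) : 64;  //__builtin_clzll(0) is undefined
}
int main()
{
	//Keys 6 (...0110) and 7 (...0111) differ only in their lowest bit -> prefix length 63.
	std::printf("%d\n", commonPrefixLength64(6, 7));   //63
	//Identical keys never occur in the tree build, since the leaf index is appended below.
	std::printf("%d\n", commonPrefixLength64(42, 42)); //64
	return 0;
}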
__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,
__global b3Int64* out_commonPrefixes,
__global int* out_commonPrefixLengths,
int numInternalNodes)
{
int internalNodeIndex = get_global_id(0);
if (internalNodeIndex >= numInternalNodes) return;
//Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,
//and the number of internal nodes is always numLeafNodes - 1
int leftLeafIndex = internalNodeIndex;
int rightLeafIndex = internalNodeIndex + 1;
int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;
int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;
//Binary radix tree construction algorithm does not work if there are duplicate morton codes.
//Append the index of each leaf node to each morton code so that there are no duplicates.
//The algorithm also requires that the morton codes are sorted in ascending order; this requirement
//is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.
//
//upsample(a, b) == ( ((b3Int64)a) << 32) | b
b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);
b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);
out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);
out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);
}
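The tie-breaking key built by upsample() above, written out as a host-side sketch (hypothetical helper name, not part of this commit):
#include <cstdint>
//Packs the 32-bit z-curve code into the high half and the sorted leaf index into
//the low half. Equal morton codes then differ in their low 32 bits, and ascending
//order is preserved because the leaf index increases left to right.
static uint64_t makeUniqueMortonKey(uint32_t mortonCode, uint32_t leafIndex)
{
	return (static_cast<uint64_t>(mortonCode) << 32) | leafIndex;
}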
__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,
__global int2* out_childNodes, int numLeafNodes)
{
int leafNodeIndex = get_global_id(0);
if (leafNodeIndex >= numLeafNodes) return;
int numInternalNodes = numLeafNodes - 1;
int leftSplitIndex = leafNodeIndex - 1;
int rightSplitIndex = leafNodeIndex;
int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
//Parent node is the highest adjacent common prefix that is lower than the node's common prefix
//Leaf nodes are considered as having the highest common prefix
int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);
//Handle cases for the edge nodes; the first and last node
//For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX
if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;
if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;
int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;
out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;
int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa
//out_childNodesAsInt[0] == int2.x == left child
//out_childNodesAsInt[1] == int2.y == right child
int isLeaf = 1;
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);
}
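The parent-selection rule above, restated as a host-side sketch (hypothetical helper, not part of this commit). For example, with four leaves and adjacent split prefix lengths {60, 58, 61}, leaf 1 becomes the right child of internal node 0 and leaf 2 the left child of internal node 2:
//INVALID_PREFIX mirrors B3_PLBVH_INVALID_COMMON_PREFIX: an out-of-range split.
static const int INVALID_PREFIX = 128;
static int selectLeafParent(int leafNodeIndex, int leftCommonPrefix, int rightCommonPrefix,
                            bool* out_isRightChildOfParent)
{
	bool leftIsParent = (leftCommonPrefix > rightCommonPrefix);
	if (leftCommonPrefix == INVALID_PREFIX) leftIsParent = false;  //first leaf has no left split
	if (rightCommonPrefix == INVALID_PREFIX) leftIsParent = true;  //last leaf has no right split
	*out_isRightChildOfParent = leftIsParent;  //a parent on the left makes this node its right child
	return leftIsParent ? (leafNodeIndex - 1) : leafNodeIndex;
}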
__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,
__global int2* out_childNodes,
__global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,
int numInternalNodes)
{
int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);
if(internalNodeIndex >= numInternalNodes) return;
b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];
int nodePrefixLength = commonPrefixLengths[internalNodeIndex];
//#define USE_LINEAR_SEARCH
#ifdef USE_LINEAR_SEARCH
int leftIndex = -1;
int rightIndex = -1;
//Find nearest element to left with a lower common prefix
for(int i = internalNodeIndex - 1; i >= 0; --i)
{
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
if(nodeLeftSharedPrefixLength < nodePrefixLength)
{
leftIndex = i;
break;
}
}
//Find nearest element to right with a lower common prefix
for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)
{
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);
if(nodeRightSharedPrefixLength < nodePrefixLength)
{
rightIndex = i;
break;
}
}
#else //Use binary search
//Find nearest element to left with a lower common prefix
int leftIndex = -1;
{
int lower = 0;
int upper = internalNodeIndex - 1;
while(lower <= upper)
{
int mid = (lower + upper) / 2;
b3Int64 midPrefix = commonPrefixes[mid];
int midPrefixLength = commonPrefixLengths[mid];
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
if(nodeMidSharedPrefixLength < nodePrefixLength)
{
int right = mid + 1;
if(right < internalNodeIndex)
{
b3Int64 rightPrefix = commonPrefixes[right];
int rightPrefixLength = commonPrefixLengths[right];
int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);
if(nodeRightSharedPrefixLength < nodePrefixLength)
{
lower = right;
leftIndex = right;
}
else
{
leftIndex = mid;
break;
}
}
else
{
leftIndex = mid;
break;
}
}
else upper = mid - 1;
}
}
//Find nearest element to right with a lower common prefix
int rightIndex = -1;
{
int lower = internalNodeIndex + 1;
int upper = numInternalNodes - 1;
while(lower <= upper)
{
int mid = (lower + upper) / 2;
b3Int64 midPrefix = commonPrefixes[mid];
int midPrefixLength = commonPrefixLengths[mid];
int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);
if(nodeMidSharedPrefixLength < nodePrefixLength)
{
int left = mid - 1;
if(left > internalNodeIndex)
{
b3Int64 leftPrefix = commonPrefixes[left];
int leftPrefixLength = commonPrefixLengths[left];
int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);
if(nodeLeftSharedPrefixLength < nodePrefixLength)
{
upper = left;
rightIndex = left;
}
else
{
rightIndex = mid;
break;
}
}
else
{
rightIndex = mid;
break;
}
}
else lower = mid + 1;
}
}
#endif
//Select parent
{
int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;
int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);
if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;
else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;
int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;
int isRootNode = (leftIndex == -1 && rightIndex == -1);
out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;
int isLeaf = 0;
if(!isRootNode)
{
int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa
//out_childNodesAsInt[0] == int2.x == left child
//out_childNodesAsInt[1] == int2.y == right child
__global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);
out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
}
else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);
}
}
__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,
__global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)
{
if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
//
int distanceFromRoot = 0;
{
int parentIndex = internalNodeParentNodes[internalNodeIndex];
while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)
{
parentIndex = internalNodeParentNodes[parentIndex];
++distanceFromRoot;
}
}
out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;
//
__local int localMaxDistanceFromRoot;
if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;
barrier(CLK_LOCAL_MEM_FENCE);
atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);
barrier(CLK_LOCAL_MEM_FENCE);
if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);
}
__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,
__global int2* childNodes,
__global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,
int maxDistanceFromRoot, int processedDistance, int numInternalNodes)
{
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
int distance = distanceFromRoot[internalNodeIndex];
if(distance == processedDistance)
{
int leftChildIndex = childNodes[internalNodeIndex].x;
int rightChildIndex = childNodes[internalNodeIndex].y;
int isLeftChildLeaf = isLeafNode(leftChildIndex);
int isRightChildLeaf = isLeafNode(rightChildIndex);
leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);
rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);
//leftRigidIndex/rightRigidIndex is not used if internal node
int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;
int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;
b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];
b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];
b3AabbCL mergedAabb;
mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);
mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);
internalNodeAabbs[internalNodeIndex] = mergedAabb;
}
}
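This kernel only processes one tree level per launch, so the host presumably sweeps processedDistance from the deepest level up to the root so that both child AABBs are merged before their parent. A sketch of that driving loop (launchAabbKernel is a hypothetical stand-in for the actual kernel launch):
void buildInternalNodeAabbs(int maxDistanceFromRoot, int numInternalNodes,
                            void (*launchAabbKernel)(int processedDistance, int numInternalNodes))
{
	//One pass per level, deepest internal nodes first, root (distance 0) last.
	for (int processedDistance = maxDistanceFromRoot; processedDistance >= 0; --processedDistance)
		launchAabbKernel(processedDistance, numInternalNodes);
}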
__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)
{
int internalNodeIndex = get_global_id(0);
if(internalNodeIndex >= numInternalNodes) return;
int numLeafNodes = numInternalNodes + 1;
int2 childNodes = internalNodeChildNodes[internalNodeIndex];
int2 leafIndexRange; //x == min leaf index, y == max leaf index
//Find lowest leaf index covered by this internal node
{
int lowestIndex = childNodes.x; //childNodes.x == Left child
while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;
leafIndexRange.x = lowestIndex;
}
//Find highest leaf index covered by this internal node
{
int highestIndex = childNodes.y; //childNodes.y == Right child
while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;
leafIndexRange.y = highestIndex;
}
//
out_leafIndexRanges[internalNodeIndex] = leafIndexRange;
}

View File

@@ -0,0 +1,729 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* parallelLinearBvhCL= \
"/*\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose,\n"
"including commercial applications, and to alter it and redistribute it freely,\n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Initial Author Jackson Lee, 2014\n"
"typedef float b3Scalar;\n"
"typedef float4 b3Vector3;\n"
"#define b3Max max\n"
"#define b3Min min\n"
"#define b3Sqrt sqrt\n"
"typedef struct\n"
"{\n"
" unsigned int m_key;\n"
" unsigned int m_value;\n"
"} SortDataCL;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"unsigned int interleaveBits(unsigned int x)\n"
"{\n"
" //........ ........ ......12 3456789A //x\n"
" //....1..2 ..3..4.. 5..6..7. .8..9..A //x after interleaving bits\n"
" \n"
" //......12 3456789A ......12 3456789A //x ^ (x << 16)\n"
" //11111111 ........ ........ 11111111 //0x FF 00 00 FF\n"
" //......12 ........ ........ 3456789A //x = (x ^ (x << 16)) & 0xFF0000FF;\n"
" \n"
" //......12 ........ 3456789A 3456789A //x ^ (x << 8)\n"
" //......11 ........ 1111.... ....1111 //0x 03 00 F0 0F\n"
" //......12 ........ 3456.... ....789A //x = (x ^ (x << 8)) & 0x0300F00F;\n"
" \n"
" //..12..12 ....3456 3456.... 789A789A //x ^ (x << 4)\n"
" //......11 ....11.. ..11.... 11....11 //0x 03 0C 30 C3\n"
" //......12 ....34.. ..56.... 78....9A //x = (x ^ (x << 4)) & 0x030C30C3;\n"
" \n"
" //....1212 ..3434.. 5656..78 78..9A9A //x ^ (x << 2)\n"
" //....1..1 ..1..1.. 1..1..1. .1..1..1 //0x 09 24 92 49\n"
" //....1..2 ..3..4.. 5..6..7. .8..9..A //x = (x ^ (x << 2)) & 0x09249249;\n"
" \n"
" //........ ........ ......11 11111111 //0x000003FF\n"
" x &= 0x000003FF; //Clear all bits above bit 10\n"
" \n"
" x = (x ^ (x << 16)) & 0xFF0000FF;\n"
" x = (x ^ (x << 8)) & 0x0300F00F;\n"
" x = (x ^ (x << 4)) & 0x030C30C3;\n"
" x = (x ^ (x << 2)) & 0x09249249;\n"
" \n"
" return x;\n"
"}\n"
"unsigned int getMortonCode(unsigned int x, unsigned int y, unsigned int z)\n"
"{\n"
" return interleaveBits(x) << 0 | interleaveBits(y) << 1 | interleaveBits(z) << 2;\n"
"}\n"
"__kernel void separateAabbs(__global b3AabbCL* unseparatedAabbs, __global int* aabbIndices, __global b3AabbCL* out_aabbs, int numAabbsToSeparate)\n"
"{\n"
" int separatedAabbIndex = get_global_id(0);\n"
" if(separatedAabbIndex >= numAabbsToSeparate) return;\n"
" int unseparatedAabbIndex = aabbIndices[separatedAabbIndex];\n"
" out_aabbs[separatedAabbIndex] = unseparatedAabbs[unseparatedAabbIndex];\n"
"}\n"
"//Should replace with an optimized parallel reduction\n"
"__kernel void findAllNodesMergedAabb(__global b3AabbCL* out_mergedAabb, int numAabbsNeedingMerge)\n"
"{\n"
" //Each time this kernel is added to the command queue, \n"
" //the number of AABBs needing to be merged is halved\n"
" //\n"
" //Example with 159 AABBs:\n"
" // numRemainingAabbs == 159 / 2 + 159 % 2 == 80\n"
" // numMergedAabbs == 159 - 80 == 79\n"
" //So, indices [0, 78] are merged with [0 + 80, 78 + 80]\n"
" \n"
" int numRemainingAabbs = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2;\n"
" int numMergedAabbs = numAabbsNeedingMerge - numRemainingAabbs;\n"
" \n"
" int aabbIndex = get_global_id(0);\n"
" if(aabbIndex >= numMergedAabbs) return;\n"
" \n"
" int otherAabbIndex = aabbIndex + numRemainingAabbs;\n"
" \n"
" b3AabbCL aabb = out_mergedAabb[aabbIndex];\n"
" b3AabbCL otherAabb = out_mergedAabb[otherAabbIndex];\n"
" \n"
" b3AabbCL mergedAabb;\n"
" mergedAabb.m_min = b3Min(aabb.m_min, otherAabb.m_min);\n"
" mergedAabb.m_max = b3Max(aabb.m_max, otherAabb.m_max);\n"
" out_mergedAabb[aabbIndex] = mergedAabb;\n"
"}\n"
"__kernel void assignMortonCodesAndAabbIndicies(__global b3AabbCL* worldSpaceAabbs, __global b3AabbCL* mergedAabbOfAllNodes, \n"
" __global SortDataCL* out_mortonCodesAndAabbIndices, int numAabbs)\n"
"{\n"
" int leafNodeIndex = get_global_id(0); //Leaf node index == AABB index\n"
" if(leafNodeIndex >= numAabbs) return;\n"
" \n"
" b3AabbCL mergedAabb = mergedAabbOfAllNodes[0];\n"
" b3Vector3 gridCenter = (mergedAabb.m_min + mergedAabb.m_max) * 0.5f;\n"
" b3Vector3 gridCellSize = (mergedAabb.m_max - mergedAabb.m_min) / (float)1024;\n"
" \n"
" b3AabbCL aabb = worldSpaceAabbs[leafNodeIndex];\n"
" b3Vector3 aabbCenter = (aabb.m_min + aabb.m_max) * 0.5f;\n"
" b3Vector3 aabbCenterRelativeToGrid = aabbCenter - gridCenter;\n"
" \n"
" //Quantize into integer coordinates\n"
" //floor() is needed to prevent the center cell, at (0,0,0) from being twice the size\n"
" b3Vector3 gridPosition = aabbCenterRelativeToGrid / gridCellSize;\n"
" \n"
" int4 discretePosition;\n"
" discretePosition.x = (int)( (gridPosition.x >= 0.0f) ? gridPosition.x : floor(gridPosition.x) );\n"
" discretePosition.y = (int)( (gridPosition.y >= 0.0f) ? gridPosition.y : floor(gridPosition.y) );\n"
" discretePosition.z = (int)( (gridPosition.z >= 0.0f) ? gridPosition.z : floor(gridPosition.z) );\n"
" \n"
" //Clamp coordinates into [-512, 511], then convert range from [-512, 511] to [0, 1023]\n"
" discretePosition = b3Max( -512, b3Min(discretePosition, 511) );\n"
" discretePosition += 512;\n"
" \n"
" //Interleave bits(assign a morton code, also known as a z-curve)\n"
" unsigned int mortonCode = getMortonCode(discretePosition.x, discretePosition.y, discretePosition.z);\n"
" \n"
" //\n"
" SortDataCL mortonCodeIndexPair;\n"
" mortonCodeIndexPair.m_key = mortonCode;\n"
" mortonCodeIndexPair.m_value = leafNodeIndex;\n"
" \n"
" out_mortonCodesAndAabbIndices[leafNodeIndex] = mortonCodeIndexPair;\n"
"}\n"
"#define B3_PLVBH_TRAVERSE_MAX_STACK_SIZE 128\n"
"//The most significant bit(0x80000000) of a int32 is used to distinguish between leaf and internal nodes.\n"
"//If it is set, then the index is for an internal node; otherwise, it is a leaf node. \n"
"//In both cases, the bit should be cleared to access the actual node index.\n"
"int isLeafNode(int index) { return (index >> 31 == 0); }\n"
"int getIndexWithInternalNodeMarkerRemoved(int index) { return index & (~0x80000000); }\n"
"int getIndexWithInternalNodeMarkerSet(int isLeaf, int index) { return (isLeaf) ? index : (index | 0x80000000); }\n"
"//From sap.cl\n"
"#define NEW_PAIR_MARKER -1\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"//From sap.cl\n"
"__kernel void plbvhCalculateOverlappingPairs(__global b3AabbCL* rigidAabbs, \n"
" __global int* rootNodeIndex, \n"
" __global int2* internalNodeChildIndices, \n"
" __global b3AabbCL* internalNodeAabbs,\n"
" __global int2* internalNodeLeafIndexRanges,\n"
" \n"
" __global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
" int maxPairs, int numQueryAabbs)\n"
"{\n"
" //Using get_group_id()/get_local_id() is Faster than get_global_id(0) since\n"
" //mortonCodesAndAabbIndices[] contains rigid body indices sorted along the z-curve (more spatially coherent)\n"
" int queryBvhNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
" if(queryBvhNodeIndex >= numQueryAabbs) return;\n"
" \n"
" int queryRigidIndex = mortonCodesAndAabbIndices[queryBvhNodeIndex].m_value;\n"
" b3AabbCL queryAabb = rigidAabbs[queryRigidIndex];\n"
" \n"
" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
" \n"
" int stackSize = 1;\n"
" stack[0] = *rootNodeIndex;\n"
" \n"
" while(stackSize)\n"
" {\n"
" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
" --stackSize;\n"
" \n"
" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
" \n"
" //Optimization - if the BVH is structured as a binary radix tree, then\n"
" //each internal node corresponds to a contiguous range of leaf nodes(internalNodeLeafIndexRanges[]).\n"
" //This can be used to avoid testing each AABB-AABB pair twice, including preventing each node from colliding with itself.\n"
" {\n"
" int highestLeafIndex = (isLeaf) ? bvhNodeIndex : internalNodeLeafIndexRanges[bvhNodeIndex].y;\n"
" if(highestLeafIndex <= queryBvhNodeIndex) continue;\n"
" }\n"
" \n"
" //bvhRigidIndex is not used if internal node\n"
" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
" \n"
" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
" if( TestAabbAgainstAabb2(&queryAabb, &bvhNodeAabb) )\n"
" {\n"
" if(isLeaf)\n"
" {\n"
" int4 pair;\n"
" pair.x = rigidAabbs[queryRigidIndex].m_minIndices[3];\n"
" pair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
" pair.z = NEW_PAIR_MARKER;\n"
" pair.w = NEW_PAIR_MARKER;\n"
" \n"
" int pairIndex = atomic_inc(out_numPairs);\n"
" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
" }\n"
" \n"
" if(!isLeaf) //Internal node\n"
" {\n"
" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
" {\n"
" //Error\n"
" }\n"
" else\n"
" {\n"
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
" }\n"
" }\n"
" }\n"
" \n"
" }\n"
"}\n"
"//From rayCastKernels.cl\n"
"typedef struct\n"
"{\n"
" float4 m_from;\n"
" float4 m_to;\n"
"} b3RayInfo;\n"
"//From rayCastKernels.cl\n"
"b3Vector3 b3Vector3_normalize(b3Vector3 v)\n"
"{\n"
" b3Vector3 normal = (b3Vector3){v.x, v.y, v.z, 0.f};\n"
" return normalize(normal); //OpenCL normalize == vector4 normalize\n"
"}\n"
"b3Scalar b3Vector3_length2(b3Vector3 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }\n"
"b3Scalar b3Vector3_dot(b3Vector3 a, b3Vector3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }\n"
"int rayIntersectsAabb(b3Vector3 rayOrigin, b3Scalar rayLength, b3Vector3 rayNormalizedDirection, b3AabbCL aabb)\n"
"{\n"
" //AABB is considered as 3 pairs of 2 planes( {x_min, x_max}, {y_min, y_max}, {z_min, z_max} ).\n"
" //t_min is the point of intersection with the closer plane, t_max is the point of intersection with the farther plane.\n"
" //\n"
" //if (rayNormalizedDirection.x < 0.0f), then max.x will be the near plane \n"
" //and min.x will be the far plane; otherwise, it is reversed.\n"
" //\n"
" //In order for there to be a collision, the t_min and t_max of each pair must overlap.\n"
" //This can be tested for by selecting the highest t_min and lowest t_max and comparing them.\n"
" \n"
" int4 isNegative = isless( rayNormalizedDirection, (b3Vector3){0.0f, 0.0f, 0.0f, 0.0f} ); //isless(x,y) returns (x < y)\n"
" \n"
" //When using vector types, the select() function checks the most signficant bit, \n"
" //but isless() sets the least significant bit.\n"
" isNegative <<= 31;\n"
" //select(b, a, condition) == condition ? a : b\n"
" //When using select() with vector types, (condition[i]) is true if its most significant bit is 1\n"
" b3Vector3 t_min = ( select(aabb.m_min, aabb.m_max, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
" b3Vector3 t_max = ( select(aabb.m_max, aabb.m_min, isNegative) - rayOrigin ) / rayNormalizedDirection;\n"
" \n"
" b3Scalar t_min_final = 0.0f;\n"
" b3Scalar t_max_final = rayLength;\n"
" \n"
" //Must use fmin()/fmax(); if one of the parameters is NaN, then the parameter that is not NaN is returned. \n"
" //Behavior of min()/max() with NaNs is undefined. (See OpenCL Specification 1.2 [6.12.2] and [6.12.4])\n"
" //Since the innermost fmin()/fmax() is always not NaN, this should never return NaN.\n"
" t_min_final = fmax( t_min.z, fmax(t_min.y, fmax(t_min.x, t_min_final)) );\n"
" t_max_final = fmin( t_max.z, fmin(t_max.y, fmin(t_max.x, t_max_final)) );\n"
" \n"
" return (t_min_final <= t_max_final);\n"
"}\n"
"__kernel void plbvhRayTraverse(__global b3AabbCL* rigidAabbs,\n"
" __global int* rootNodeIndex, \n"
" __global int2* internalNodeChildIndices, \n"
" __global b3AabbCL* internalNodeAabbs,\n"
" __global int2* internalNodeLeafIndexRanges,\n"
" __global SortDataCL* mortonCodesAndAabbIndices,\n"
" \n"
" __global b3RayInfo* rays,\n"
" \n"
" __global int* out_numRayRigidPairs, \n"
" __global int2* out_rayRigidPairs,\n"
" int maxRayRigidPairs, int numRays)\n"
"{\n"
" int rayIndex = get_global_id(0);\n"
" if(rayIndex >= numRays) return;\n"
" \n"
" //\n"
" b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
" b3Vector3 rayTo = rays[rayIndex].m_to;\n"
" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
" \n"
" //\n"
" int stack[B3_PLVBH_TRAVERSE_MAX_STACK_SIZE];\n"
" \n"
" int stackSize = 1;\n"
" stack[0] = *rootNodeIndex;\n"
" \n"
" while(stackSize)\n"
" {\n"
" int internalOrLeafNodeIndex = stack[ stackSize - 1 ];\n"
" --stackSize;\n"
" \n"
" int isLeaf = isLeafNode(internalOrLeafNodeIndex); //Internal node if false\n"
" int bvhNodeIndex = getIndexWithInternalNodeMarkerRemoved(internalOrLeafNodeIndex);\n"
" \n"
" //bvhRigidIndex is not used if internal node\n"
" int bvhRigidIndex = (isLeaf) ? mortonCodesAndAabbIndices[bvhNodeIndex].m_value : -1;\n"
" \n"
" b3AabbCL bvhNodeAabb = (isLeaf) ? rigidAabbs[bvhRigidIndex] : internalNodeAabbs[bvhNodeIndex];\n"
" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, bvhNodeAabb) )\n"
" {\n"
" if(isLeaf)\n"
" {\n"
" int2 rayRigidPair;\n"
" rayRigidPair.x = rayIndex;\n"
" rayRigidPair.y = rigidAabbs[bvhRigidIndex].m_minIndices[3];\n"
" \n"
" int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
" }\n"
" \n"
" if(!isLeaf) //Internal node\n"
" {\n"
" if(stackSize + 2 > B3_PLVBH_TRAVERSE_MAX_STACK_SIZE)\n"
" {\n"
" //Error\n"
" }\n"
" else\n"
" {\n"
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].x;\n"
" stack[ stackSize++ ] = internalNodeChildIndices[bvhNodeIndex].y;\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void plbvhLargeAabbAabbTest(__global b3AabbCL* smallAabbs, __global b3AabbCL* largeAabbs, \n"
" __global int* out_numPairs, __global int4* out_overlappingPairs, \n"
" int maxPairs, int numLargeAabbRigids, int numSmallAabbRigids)\n"
"{\n"
" int smallAabbIndex = get_global_id(0);\n"
" if(smallAabbIndex >= numSmallAabbRigids) return;\n"
" \n"
" b3AabbCL smallAabb = smallAabbs[smallAabbIndex];\n"
" for(int i = 0; i < numLargeAabbRigids; ++i)\n"
" {\n"
" b3AabbCL largeAabb = largeAabbs[i];\n"
" if( TestAabbAgainstAabb2(&smallAabb, &largeAabb) )\n"
" {\n"
" int4 pair;\n"
" pair.x = largeAabb.m_minIndices[3];\n"
" pair.y = smallAabb.m_minIndices[3];\n"
" pair.z = NEW_PAIR_MARKER;\n"
" pair.w = NEW_PAIR_MARKER;\n"
" \n"
" int pairIndex = atomic_inc(out_numPairs);\n"
" if(pairIndex < maxPairs) out_overlappingPairs[pairIndex] = pair;\n"
" }\n"
" }\n"
"}\n"
"__kernel void plbvhLargeAabbRayTest(__global b3AabbCL* largeRigidAabbs, __global b3RayInfo* rays,\n"
" __global int* out_numRayRigidPairs, __global int2* out_rayRigidPairs,\n"
" int numLargeAabbRigids, int maxRayRigidPairs, int numRays)\n"
"{\n"
" int rayIndex = get_global_id(0);\n"
" if(rayIndex >= numRays) return;\n"
" \n"
" b3Vector3 rayFrom = rays[rayIndex].m_from;\n"
" b3Vector3 rayTo = rays[rayIndex].m_to;\n"
" b3Vector3 rayNormalizedDirection = b3Vector3_normalize(rayTo - rayFrom);\n"
" b3Scalar rayLength = b3Sqrt( b3Vector3_length2(rayTo - rayFrom) );\n"
" \n"
" for(int i = 0; i < numLargeAabbRigids; ++i)\n"
" {\n"
" b3AabbCL rigidAabb = largeRigidAabbs[i];\n"
" if( rayIntersectsAabb(rayFrom, rayLength, rayNormalizedDirection, rigidAabb) )\n"
" {\n"
" int2 rayRigidPair;\n"
" rayRigidPair.x = rayIndex;\n"
" rayRigidPair.y = rigidAabb.m_minIndices[3];\n"
" \n"
" int pairIndex = atomic_inc(out_numRayRigidPairs);\n"
" if(pairIndex < maxRayRigidPairs) out_rayRigidPairs[pairIndex] = rayRigidPair;\n"
" }\n"
" }\n"
"}\n"
"//Set so that it is always greater than the actual common prefixes, and never selected as a parent node.\n"
"//If there are no duplicates, then the highest common prefix is 32 or 64, depending on the number of bits used for the z-curve.\n"
"//Duplicate common prefixes increase the highest common prefix at most by the number of bits used to index the leaf node.\n"
"//Since 32 bit ints are used to index leaf nodes, the max prefix is 64(32 + 32 bit z-curve) or 96(32 + 64 bit z-curve).\n"
"#define B3_PLBVH_INVALID_COMMON_PREFIX 128\n"
"#define B3_PLBVH_ROOT_NODE_MARKER -1\n"
"#define b3Int64 long\n"
"int computeCommonPrefixLength(b3Int64 i, b3Int64 j) { return (int)clz(i ^ j); }\n"
"b3Int64 computeCommonPrefix(b3Int64 i, b3Int64 j) \n"
"{\n"
" //This function only needs to return (i & j) in order for the algorithm to work,\n"
" //but it may help with debugging to mask out the lower bits.\n"
" b3Int64 commonPrefixLength = (b3Int64)computeCommonPrefixLength(i, j);\n"
" b3Int64 sharedBits = i & j;\n"
" b3Int64 bitmask = ((b3Int64)(~0)) << (64 - commonPrefixLength); //Set all bits after the common prefix to 0\n"
" \n"
" return sharedBits & bitmask;\n"
"}\n"
"//Same as computeCommonPrefixLength(), but allows for prefixes with different lengths\n"
"int getSharedPrefixLength(b3Int64 prefixA, int prefixLengthA, b3Int64 prefixB, int prefixLengthB)\n"
"{\n"
" return b3Min( computeCommonPrefixLength(prefixA, prefixB), b3Min(prefixLengthA, prefixLengthB) );\n"
"}\n"
"__kernel void computeAdjacentPairCommonPrefix(__global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global b3Int64* out_commonPrefixes,\n"
" __global int* out_commonPrefixLengths,\n"
" int numInternalNodes)\n"
"{\n"
" int internalNodeIndex = get_global_id(0);\n"
" if (internalNodeIndex >= numInternalNodes) return;\n"
" \n"
" //Here, (internalNodeIndex + 1) is never out of bounds since it is a leaf node index,\n"
" //and the number of internal nodes is always numLeafNodes - 1\n"
" int leftLeafIndex = internalNodeIndex;\n"
" int rightLeafIndex = internalNodeIndex + 1;\n"
" \n"
" int leftLeafMortonCode = mortonCodesAndAabbIndices[leftLeafIndex].m_key;\n"
" int rightLeafMortonCode = mortonCodesAndAabbIndices[rightLeafIndex].m_key;\n"
" \n"
" //Binary radix tree construction algorithm does not work if there are duplicate morton codes.\n"
" //Append the index of each leaf node to each morton code so that there are no duplicates.\n"
" //The algorithm also requires that the morton codes are sorted in ascending order; this requirement\n"
" //is also satisfied with this method, as (leftLeafIndex < rightLeafIndex) is always true.\n"
" //\n"
" //upsample(a, b) == ( ((b3Int64)a) << 32) | b\n"
" b3Int64 nonduplicateLeftMortonCode = upsample(leftLeafMortonCode, leftLeafIndex);\n"
" b3Int64 nonduplicateRightMortonCode = upsample(rightLeafMortonCode, rightLeafIndex);\n"
" \n"
" out_commonPrefixes[internalNodeIndex] = computeCommonPrefix(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
" out_commonPrefixLengths[internalNodeIndex] = computeCommonPrefixLength(nonduplicateLeftMortonCode, nonduplicateRightMortonCode);\n"
"}\n"
"__kernel void buildBinaryRadixTreeLeafNodes(__global int* commonPrefixLengths, __global int* out_leafNodeParentNodes,\n"
" __global int2* out_childNodes, int numLeafNodes)\n"
"{\n"
" int leafNodeIndex = get_global_id(0);\n"
" if (leafNodeIndex >= numLeafNodes) return;\n"
" \n"
" int numInternalNodes = numLeafNodes - 1;\n"
" \n"
" int leftSplitIndex = leafNodeIndex - 1;\n"
" int rightSplitIndex = leafNodeIndex;\n"
" \n"
" int leftCommonPrefix = (leftSplitIndex >= 0) ? commonPrefixLengths[leftSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
" int rightCommonPrefix = (rightSplitIndex < numInternalNodes) ? commonPrefixLengths[rightSplitIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
" \n"
" //Parent node is the highest adjacent common prefix that is lower than the node's common prefix\n"
" //Leaf nodes are considered as having the highest common prefix\n"
" int isLeftHigherCommonPrefix = (leftCommonPrefix > rightCommonPrefix);\n"
" \n"
" //Handle cases for the edge nodes; the first and last node\n"
" //For leaf nodes, leftCommonPrefix and rightCommonPrefix should never both be B3_PLBVH_INVALID_COMMON_PREFIX\n"
" if(leftCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = false;\n"
" if(rightCommonPrefix == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherCommonPrefix = true;\n"
" \n"
" int parentNodeIndex = (isLeftHigherCommonPrefix) ? leftSplitIndex : rightSplitIndex;\n"
" out_leafNodeParentNodes[leafNodeIndex] = parentNodeIndex;\n"
" \n"
" int isRightChild = (isLeftHigherCommonPrefix); //If the left node is the parent, then this node is its right child and vice versa\n"
" \n"
" //out_childNodesAsInt[0] == int2.x == left child\n"
" //out_childNodesAsInt[1] == int2.y == right child\n"
" int isLeaf = 1;\n"
" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, leafNodeIndex);\n"
"}\n"
"__kernel void buildBinaryRadixTreeInternalNodes(__global b3Int64* commonPrefixes, __global int* commonPrefixLengths,\n"
" __global int2* out_childNodes,\n"
" __global int* out_internalNodeParentNodes, __global int* out_rootNodeIndex,\n"
" int numInternalNodes)\n"
"{\n"
" int internalNodeIndex = get_group_id(0) * get_local_size(0) + get_local_id(0);\n"
" if(internalNodeIndex >= numInternalNodes) return;\n"
" \n"
" b3Int64 nodePrefix = commonPrefixes[internalNodeIndex];\n"
" int nodePrefixLength = commonPrefixLengths[internalNodeIndex];\n"
" \n"
"//#define USE_LINEAR_SEARCH\n"
"#ifdef USE_LINEAR_SEARCH\n"
" int leftIndex = -1;\n"
" int rightIndex = -1;\n"
" \n"
" //Find nearest element to left with a lower common prefix\n"
" for(int i = internalNodeIndex - 1; i >= 0; --i)\n"
" {\n"
" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
" if(nodeLeftSharedPrefixLength < nodePrefixLength)\n"
" {\n"
" leftIndex = i;\n"
" break;\n"
" }\n"
" }\n"
" \n"
" //Find nearest element to right with a lower common prefix\n"
" for(int i = internalNodeIndex + 1; i < numInternalNodes; ++i)\n"
" {\n"
" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, commonPrefixes[i], commonPrefixLengths[i]);\n"
" if(nodeRightSharedPrefixLength < nodePrefixLength)\n"
" {\n"
" rightIndex = i;\n"
" break;\n"
" }\n"
" }\n"
" \n"
"#else //Use binary search\n"
" //Find nearest element to left with a lower common prefix\n"
" int leftIndex = -1;\n"
" {\n"
" int lower = 0;\n"
" int upper = internalNodeIndex - 1;\n"
" \n"
" while(lower <= upper)\n"
" {\n"
" int mid = (lower + upper) / 2;\n"
" b3Int64 midPrefix = commonPrefixes[mid];\n"
" int midPrefixLength = commonPrefixLengths[mid];\n"
" \n"
" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
" if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
" {\n"
" int right = mid + 1;\n"
" if(right < internalNodeIndex)\n"
" {\n"
" b3Int64 rightPrefix = commonPrefixes[right];\n"
" int rightPrefixLength = commonPrefixLengths[right];\n"
" \n"
" int nodeRightSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, rightPrefix, rightPrefixLength);\n"
" if(nodeRightSharedPrefixLength < nodePrefixLength) \n"
" {\n"
" lower = right;\n"
" leftIndex = right;\n"
" }\n"
" else \n"
" {\n"
" leftIndex = mid;\n"
" break;\n"
" }\n"
" }\n"
" else \n"
" {\n"
" leftIndex = mid;\n"
" break;\n"
" }\n"
" }\n"
" else upper = mid - 1;\n"
" }\n"
" }\n"
" \n"
" //Find nearest element to right with a lower common prefix\n"
" int rightIndex = -1;\n"
" {\n"
" int lower = internalNodeIndex + 1;\n"
" int upper = numInternalNodes - 1;\n"
" \n"
" while(lower <= upper)\n"
" {\n"
" int mid = (lower + upper) / 2;\n"
" b3Int64 midPrefix = commonPrefixes[mid];\n"
" int midPrefixLength = commonPrefixLengths[mid];\n"
" \n"
" int nodeMidSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, midPrefix, midPrefixLength);\n"
" if(nodeMidSharedPrefixLength < nodePrefixLength) \n"
" {\n"
" int left = mid - 1;\n"
" if(left > internalNodeIndex)\n"
" {\n"
" b3Int64 leftPrefix = commonPrefixes[left];\n"
" int leftPrefixLength = commonPrefixLengths[left];\n"
" \n"
" int nodeLeftSharedPrefixLength = getSharedPrefixLength(nodePrefix, nodePrefixLength, leftPrefix, leftPrefixLength);\n"
" if(nodeLeftSharedPrefixLength < nodePrefixLength) \n"
" {\n"
" upper = left;\n"
" rightIndex = left;\n"
" }\n"
" else \n"
" {\n"
" rightIndex = mid;\n"
" break;\n"
" }\n"
" }\n"
" else \n"
" {\n"
" rightIndex = mid;\n"
" break;\n"
" }\n"
" }\n"
" else lower = mid + 1;\n"
" }\n"
" }\n"
"#endif\n"
" \n"
" //Select parent\n"
" {\n"
" int leftPrefixLength = (leftIndex != -1) ? commonPrefixLengths[leftIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
" int rightPrefixLength = (rightIndex != -1) ? commonPrefixLengths[rightIndex] : B3_PLBVH_INVALID_COMMON_PREFIX;\n"
" \n"
" int isLeftHigherPrefixLength = (leftPrefixLength > rightPrefixLength);\n"
" \n"
" if(leftPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = false;\n"
" else if(rightPrefixLength == B3_PLBVH_INVALID_COMMON_PREFIX) isLeftHigherPrefixLength = true;\n"
" \n"
" int parentNodeIndex = (isLeftHigherPrefixLength) ? leftIndex : rightIndex;\n"
" \n"
" int isRootNode = (leftIndex == -1 && rightIndex == -1);\n"
" out_internalNodeParentNodes[internalNodeIndex] = (!isRootNode) ? parentNodeIndex : B3_PLBVH_ROOT_NODE_MARKER;\n"
" \n"
" int isLeaf = 0;\n"
" if(!isRootNode)\n"
" {\n"
" int isRightChild = (isLeftHigherPrefixLength); //If the left node is the parent, then this node is its right child and vice versa\n"
" \n"
" //out_childNodesAsInt[0] == int2.x == left child\n"
" //out_childNodesAsInt[1] == int2.y == right child\n"
" __global int* out_childNodesAsInt = (__global int*)(&out_childNodes[parentNodeIndex]);\n"
" out_childNodesAsInt[isRightChild] = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
" }\n"
" else *out_rootNodeIndex = getIndexWithInternalNodeMarkerSet(isLeaf, internalNodeIndex);\n"
" }\n"
"}\n"
"__kernel void findDistanceFromRoot(__global int* rootNodeIndex, __global int* internalNodeParentNodes,\n"
" __global int* out_maxDistanceFromRoot, __global int* out_distanceFromRoot, int numInternalNodes)\n"
"{\n"
" if( get_global_id(0) == 0 ) atomic_xchg(out_maxDistanceFromRoot, 0);\n"
" int internalNodeIndex = get_global_id(0);\n"
" if(internalNodeIndex >= numInternalNodes) return;\n"
" \n"
" //\n"
" int distanceFromRoot = 0;\n"
" {\n"
" int parentIndex = internalNodeParentNodes[internalNodeIndex];\n"
" while(parentIndex != B3_PLBVH_ROOT_NODE_MARKER)\n"
" {\n"
" parentIndex = internalNodeParentNodes[parentIndex];\n"
" ++distanceFromRoot;\n"
" }\n"
" }\n"
" out_distanceFromRoot[internalNodeIndex] = distanceFromRoot;\n"
" \n"
" //\n"
" __local int localMaxDistanceFromRoot;\n"
" if( get_local_id(0) == 0 ) localMaxDistanceFromRoot = 0;\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" atomic_max(&localMaxDistanceFromRoot, distanceFromRoot);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if( get_local_id(0) == 0 ) atomic_max(out_maxDistanceFromRoot, localMaxDistanceFromRoot);\n"
"}\n"
"__kernel void buildBinaryRadixTreeAabbsRecursive(__global int* distanceFromRoot, __global SortDataCL* mortonCodesAndAabbIndices,\n"
" __global int2* childNodes,\n"
" __global b3AabbCL* leafNodeAabbs, __global b3AabbCL* internalNodeAabbs,\n"
" int maxDistanceFromRoot, int processedDistance, int numInternalNodes)\n"
"{\n"
" int internalNodeIndex = get_global_id(0);\n"
" if(internalNodeIndex >= numInternalNodes) return;\n"
" \n"
" int distance = distanceFromRoot[internalNodeIndex];\n"
" \n"
" if(distance == processedDistance)\n"
" {\n"
" int leftChildIndex = childNodes[internalNodeIndex].x;\n"
" int rightChildIndex = childNodes[internalNodeIndex].y;\n"
" \n"
" int isLeftChildLeaf = isLeafNode(leftChildIndex);\n"
" int isRightChildLeaf = isLeafNode(rightChildIndex);\n"
" \n"
" leftChildIndex = getIndexWithInternalNodeMarkerRemoved(leftChildIndex);\n"
" rightChildIndex = getIndexWithInternalNodeMarkerRemoved(rightChildIndex);\n"
" \n"
" //leftRigidIndex/rightRigidIndex is not used if internal node\n"
" int leftRigidIndex = (isLeftChildLeaf) ? mortonCodesAndAabbIndices[leftChildIndex].m_value : -1;\n"
" int rightRigidIndex = (isRightChildLeaf) ? mortonCodesAndAabbIndices[rightChildIndex].m_value : -1;\n"
" \n"
" b3AabbCL leftChildAabb = (isLeftChildLeaf) ? leafNodeAabbs[leftRigidIndex] : internalNodeAabbs[leftChildIndex];\n"
" b3AabbCL rightChildAabb = (isRightChildLeaf) ? leafNodeAabbs[rightRigidIndex] : internalNodeAabbs[rightChildIndex];\n"
" \n"
" b3AabbCL mergedAabb;\n"
" mergedAabb.m_min = b3Min(leftChildAabb.m_min, rightChildAabb.m_min);\n"
" mergedAabb.m_max = b3Max(leftChildAabb.m_max, rightChildAabb.m_max);\n"
" internalNodeAabbs[internalNodeIndex] = mergedAabb;\n"
" }\n"
"}\n"
"__kernel void findLeafIndexRanges(__global int2* internalNodeChildNodes, __global int2* out_leafIndexRanges, int numInternalNodes)\n"
"{\n"
" int internalNodeIndex = get_global_id(0);\n"
" if(internalNodeIndex >= numInternalNodes) return;\n"
" \n"
" int numLeafNodes = numInternalNodes + 1;\n"
" \n"
" int2 childNodes = internalNodeChildNodes[internalNodeIndex];\n"
" \n"
" int2 leafIndexRange; //x == min leaf index, y == max leaf index\n"
" \n"
" //Find lowest leaf index covered by this internal node\n"
" {\n"
" int lowestIndex = childNodes.x; //childNodes.x == Left child\n"
" while( !isLeafNode(lowestIndex) ) lowestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(lowestIndex) ].x;\n"
" leafIndexRange.x = lowestIndex;\n"
" }\n"
" \n"
" //Find highest leaf index covered by this internal node\n"
" {\n"
" int highestIndex = childNodes.y; //childNodes.y == Right child\n"
" while( !isLeafNode(highestIndex) ) highestIndex = internalNodeChildNodes[ getIndexWithInternalNodeMarkerRemoved(highestIndex) ].y;\n"
" leafIndexRange.y = highestIndex;\n"
" }\n"
" \n"
" //\n"
" out_leafIndexRanges[internalNodeIndex] = leafIndexRange;\n"
"}\n"
;
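The findAllNodesMergedAabb kernel in the string above halves the number of AABBs still needing a merge on every launch; a host-side sketch of the expected driving loop (launchMergeKernel is a hypothetical stand-in for enqueueing the kernel, not part of this commit):
//Roughly log2(n) launches reduce n leaf AABBs to a single bounding AABB in element 0.
void mergeAllAabbs(int numLeafAabbs, void (*launchMergeKernel)(int numAabbsNeedingMerge))
{
	for (int n = numLeafAabbs; n > 1; n = n / 2 + n % 2)
		launchMergeKernel(n);   //e.g. 159 -> 80 -> 40 -> 20 -> 10 -> 5 -> 3 -> 2 -> 1
}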

View File

@@ -8,6 +8,11 @@
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuBroadphaseInterface.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3GpuParallelLinearBvh.h"
#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
@@ -20,7 +25,24 @@ struct b3GpuRaycastInternalData
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_q;
cl_kernel m_raytraceKernel;
cl_kernel m_raytraceKernel;
cl_kernel m_raytracePairsKernel;
cl_kernel m_findRayRigidPairIndexRanges;
b3GpuParallelLinearBvh* m_plbvh;
b3RadixSort32CL* m_radixSorter;
b3FillCL* m_fill;
//1 element per ray
b3OpenCLArray<b3RayInfo>* m_gpuRays;
b3OpenCLArray<b3RayHit>* m_gpuHitResults;
b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
int m_test;
};
@@ -31,7 +53,19 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
m_data->m_device = device;
m_data->m_q = q;
m_data->m_raytraceKernel = 0;
m_data->m_raytracePairsKernel = 0;
m_data->m_findRayRigidPairIndexRanges = 0;
m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
m_data->m_fill = new b3FillCL(ctx, device, q);
m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
m_data->m_numRayRigidPairsPerRay = new b3OpenCLArray<int>(ctx, q);
m_data->m_gpuNumRayRigidPairs = new b3OpenCLArray<int>(ctx, q);
m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
{
cl_int errNum=0;
@@ -39,6 +73,10 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
b3Assert(errNum==CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
clReleaseProgram(prog);
}
@@ -48,6 +86,20 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
b3GpuRaycast::~b3GpuRaycast()
{
clReleaseKernel(m_data->m_raytraceKernel);
clReleaseKernel(m_data->m_raytracePairsKernel);
clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
delete m_data->m_plbvh;
delete m_data->m_radixSorter;
delete m_data->m_fill;
delete m_data->m_gpuRays;
delete m_data->m_gpuHitResults;
delete m_data->m_firstRayRigidPairIndexPerRay;
delete m_data->m_numRayRigidPairsPerRay;
delete m_data->m_gpuNumRayRigidPairs;
delete m_data->m_gpuRayRigidPairs;
delete m_data;
}
@@ -206,27 +258,32 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3A
}
///todo: add some acceleration structure (AABBs, tree etc)
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
{
//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
B3_PROFILE("castRaysGPU");
b3OpenCLArray<b3RayInfo> gpuRays(m_data->m_context,m_data->m_q);
b3OpenCLArray<b3RayHit> gpuHitResults(m_data->m_context,m_data->m_q);
{
B3_PROFILE("raycast copyFromHost");
gpuRays.copyFromHost(rays);
gpuHitResults.resize(hitResults.size());
gpuHitResults.copyFromHost(hitResults);
m_data->m_gpuRays->copyFromHost(rays);
m_data->m_gpuHitResults->copyFromHost(hitResults);
}
int numRays = hitResults.size();
{
m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
m_data->m_numRayRigidPairsPerRay->resize(numRays);
m_data->m_gpuNumRayRigidPairs->resize(1);
m_data->m_gpuRayRigidPairs->resize(numRays * 16);
}
//run kernel
const bool USE_BRUTE_FORCE_RAYCAST = false;
if(USE_BRUTE_FORCE_RAYCAST)
{
B3_PROFILE("raycast launch1D");
@@ -234,8 +291,8 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align
int numRays = rays.size();
launcher.setConst(numRays);
launcher.setBuffer(gpuRays.getBufferCL());
launcher.setBuffer(gpuHitResults.getBufferCL());
launcher.setBuffer(m_data->m_gpuRays->getBufferCL());
launcher.setBuffer(m_data->m_gpuHitResults->getBufferCL());
launcher.setConst(numBodies);
launcher.setBuffer(narrowphaseData->m_bodyBufferGPU->getBufferCL());
@@ -246,11 +303,89 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
else
{
m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() );
m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
int numRayRigidPairs = -1;
m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() )
{
numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
}
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
//Sort ray-rigid pairs by ray index
{
B3_PROFILE("sort ray-rigid pairs");
m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) );
}
//detect the first pair index and pair count for each ray
{
B3_PROFILE("detect ray-rigid pair index ranges");
{
B3_PROFILE("reset ray-rigid pair index ranges");
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
clFinish(m_data->m_q);
}
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ),
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() )
};
b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numRayRigidPairs);
launcher.launch1D(numRayRigidPairs);
clFinish(m_data->m_q);
}
{
B3_PROFILE("ray-rigid intersection");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ),
b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ),
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ),
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() )
};
b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
}
//copy results
{
B3_PROFILE("raycast copyToHost");
gpuHitResults.copyToHost(hitResults);
m_data->m_gpuHitResults->copyToHost(hitResults);
}
}
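The reinterpret_cast above works because a b3Int2 ray-rigid pair and a b3SortData {key, value} record are both two packed 32-bit integers, so sorting on the first integer groups each ray's pairs contiguously. A standalone sketch of that grouping (std::sort standing in for b3RadixSort32CL; not part of this commit):
#include <algorithm>
#include <cstdio>
#include <vector>
struct RayRigidPair { int rayIndex; int rigidIndex; };  //layout matches b3Int2
int main()
{
	std::vector<RayRigidPair> pairs = { {2, 7}, {0, 3}, {2, 1}, {1, 9} };
	std::sort(pairs.begin(), pairs.end(),
	          [](const RayRigidPair& a, const RayRigidPair& b) { return a.rayIndex < b.rayIndex; });
	for (const RayRigidPair& p : pairs)
		std::printf("ray %d -> rigid %d\n", p.rayIndex, p.rigidIndex);  //grouped by ray: 0, 1, 2, 2
	return 0;
}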

View File

@@ -23,8 +23,7 @@ public:
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData
);
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);

View File

@@ -337,3 +337,103 @@ __kernel void rayCastKernel(
}
}
__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs,
__global int* out_firstRayRigidPairIndexPerRay,
__global int* out_numRayRigidPairsPerRay,
int numRayRigidPairs)
{
int rayRigidPairIndex = get_global_id(0);
if (rayRigidPairIndex >= numRayRigidPairs) return;
int rayIndex = rayRigidPairs[rayRigidPairIndex].x;
atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);
atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);
}
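A sequential host-side sketch (not part of this commit) of what this kernel computes; it assumes the pairs are already sorted by ray index and that the first-index array was pre-filled with numRayRigidPairs, as the host code does with b3FillCL before the launch:
void findRayRigidPairIndexRangesHost(const int* rayIndexOfPair, int numRayRigidPairs, int numRays,
                                     int* out_first, int* out_count)
{
	for (int r = 0; r < numRays; ++r) { out_first[r] = numRayRigidPairs; out_count[r] = 0; }
	for (int p = 0; p < numRayRigidPairs; ++p)
	{
		const int r = rayIndexOfPair[p];
		if (p < out_first[r]) out_first[r] = p;  //atomic_min() in the kernel
		++out_count[r];                          //atomic_inc() in the kernel
	}
}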
__kernel void rayCastPairsKernel(const __global b3RayInfo* rays,
								__global b3RayHit* hitResults,
								__global int* firstRayRigidPairIndexPerRay,
								__global int* numRayRigidPairsPerRay,
								__global Body* bodies,
								__global Collidable* collidables,
								__global const b3GpuFace* faces,
								__global const ConvexPolyhedronCL* convexShapes,
								__global int2* rayRigidPairs,
								int numRays)
{
	int i = get_global_id(0);
	if (i >= numRays) return;
	
	float4 rayFrom = rays[i].m_from;
	float4 rayTo = rays[i].m_to;
	
	hitResults[i].m_hitFraction = 1.f;
	
	float hitFraction = 1.f;
	float4 hitPoint;
	float4 hitNormal;
	int hitBodyIndex = -1;
	
	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)
	{
		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];
		int b = rayRigidPairs[rayRigidPairIndex].y;
		
		if (hitResults[i].m_hitResult2 == b) continue;
		
		Body body = bodies[b];
		Collidable rigidCollidable = collidables[body.m_collidableIdx];
		
		float4 pos = body.m_pos;
		float4 orn = body.m_quat;
		
		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)
		{
			float4 invPos = (float4)(0,0,0,0);
			float4 invOrn = (float4)(0,0,0,0);
			float4 rayFromLocal = (float4)(0,0,0,0);
			float4 rayToLocal = (float4)(0,0,0,0);
			invOrn = qtInvert(orn);
			invPos = qtRotate(invOrn, -pos);
			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;
			rayToLocal = qtRotate( invOrn, rayTo ) + invPos;
			rayFromLocal.w = 0.f;
			rayToLocal.w = 0.f;
			
			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;
			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;
			
			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset, faces, &hitFraction, &hitNormal))
			{
				hitBodyIndex = b;
				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
			}
		}
		
		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)
		{
			float radius = rigidCollidable.m_radius;
			
			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))
			{
				hitBodyIndex = b;
				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);
				hitNormal = (float4) (hitPoint - bodies[b].m_pos);
			}
		}
	}
	
	if (hitBodyIndex >= 0)
	{
		hitResults[i].m_hitFraction = hitFraction;
		hitResults[i].m_hitPoint = hitPoint;
		hitResults[i].m_hitNormal = normalize(hitNormal);
		hitResults[i].m_hitResult0 = hitBodyIndex;
	}
}

View File

@@ -281,4 +281,101 @@ static const char* rayCastKernelCL= \
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
" __global int* out_firstRayRigidPairIndexPerRay,\n"
" __global int* out_numRayRigidPairsPerRay,\n"
" int numRayRigidPairs)\n"
"{\n"
" int rayRigidPairIndex = get_global_id(0);\n"
" if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
" \n"
" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
" \n"
" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" __global int* firstRayRigidPairIndexPerRay,\n"
" __global int* numRayRigidPairsPerRay,\n"
" \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" \n"
" __global int2* rayRigidPairs,\n"
" int numRays)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i >= numRays) return;\n"
" \n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" \n"
" hitResults[i].m_hitFraction = 1.f;\n"
" \n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex = -1;\n"
" \n"
" //\n"
" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
" {\n"
" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
" int b = rayRigidPairs[rayRigidPairIndex].y;\n"
" \n"
" if (hitResults[i].m_hitResult2 == b) continue;\n"
" \n"
" Body body = bodies[b];\n"
" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
" \n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
" \n"
" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" }\n"
" }\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = rigidCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" \n"
" if (hitBodyIndex >= 0)\n"
" {\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
" \n"
"}\n"
;

View File

@@ -703,6 +703,6 @@ void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& ray
{
this->m_data->m_raycaster->castRays(rays,hitResults,
getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getInternalData()
);
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
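//the broadphase is passed along so that the raycaster can use it to generate the
//ray-rigid candidate pairs consumed by the per-pair intersection kernels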
}