Code-style consistency improvement:
Apply clang-format-all.sh using the _clang-format file through all the cpp/.h files. make sure not to apply it to certain serialization structures, since some parser expects the * as part of the name, instead of type. This commit contains no other changes aside from adding and applying clang-format-all.sh
This commit is contained in:
@@ -12,33 +12,31 @@
|
||||
class b3GpuBroadphaseInterface
|
||||
{
|
||||
public:
|
||||
|
||||
typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q);
|
||||
typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
|
||||
|
||||
virtual ~b3GpuBroadphaseInterface()
|
||||
{
|
||||
}
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
|
||||
|
||||
virtual void calculateOverlappingPairs(int maxPairs)=0;
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs)=0;
|
||||
virtual void calculateOverlappingPairs(int maxPairs) = 0;
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
virtual void writeAabbsToGpu()=0;
|
||||
virtual void writeAabbsToGpu() = 0;
|
||||
|
||||
virtual cl_mem getAabbBufferWS()=0;
|
||||
virtual int getNumOverlap()=0;
|
||||
virtual cl_mem getOverlappingPairBuffer()=0;
|
||||
virtual cl_mem getAabbBufferWS() = 0;
|
||||
virtual int getNumOverlap() = 0;
|
||||
virtual cl_mem getOverlappingPairBuffer() = 0;
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
|
||||
|
||||
};
|
||||
|
||||
#endif //B3_GPU_BROADPHASE_INTERFACE_H
|
||||
#endif //B3_GPU_BROADPHASE_INTERFACE_H
|
||||
|
||||
@@ -5,12 +5,9 @@
|
||||
#include "kernels/sapKernels.h"
|
||||
//#include "kernels/gridBroadphase.cl"
|
||||
|
||||
|
||||
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
|
||||
|
||||
|
||||
|
||||
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
|
||||
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
|
||||
|
||||
@@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs;
|
||||
cl_kernel m_copyAabbsKernel;
|
||||
cl_kernel m_sap2Kernel;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//int maxPairsPerBody = 64;
|
||||
int maxBodiesPerCell = 256;//??
|
||||
int maxBodiesPerCell = 256; //??
|
||||
|
||||
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
|
||||
:m_context(ctx),
|
||||
m_device(device),
|
||||
m_queue(q),
|
||||
m_allAabbsGPU1(ctx,q),
|
||||
m_smallAabbsMappingGPU(ctx,q),
|
||||
m_largeAabbsMappingGPU(ctx,q),
|
||||
m_gpuPairs(ctx,q),
|
||||
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
: m_context(ctx),
|
||||
m_device(device),
|
||||
m_queue(q),
|
||||
m_allAabbsGPU1(ctx, q),
|
||||
m_smallAabbsMappingGPU(ctx, q),
|
||||
m_largeAabbsMappingGPU(ctx, q),
|
||||
m_gpuPairs(ctx, q),
|
||||
|
||||
m_hashGpu(ctx,q),
|
||||
m_hashGpu(ctx, q),
|
||||
|
||||
m_cellStartGpu(ctx,q),
|
||||
m_paramsGPU(ctx,q)
|
||||
m_cellStartGpu(ctx, q),
|
||||
m_paramsGPU(ctx, q)
|
||||
{
|
||||
|
||||
|
||||
b3Vector3 gridSize = b3MakeVector3(3,3,3);
|
||||
b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]);
|
||||
b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
|
||||
b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
|
||||
|
||||
m_paramsCPU.m_gridSize[0] = 128;
|
||||
m_paramsCPU.m_gridSize[1] = 128;
|
||||
@@ -58,92 +49,79 @@ m_paramsGPU(ctx,q)
|
||||
m_paramsCPU.m_invCellSize[3] = 0.f;
|
||||
m_paramsGPU.push_back(m_paramsCPU);
|
||||
|
||||
cl_int errNum=0;
|
||||
cl_int errNum = 0;
|
||||
|
||||
{
|
||||
const char* sapSrc = sapCL;
|
||||
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
|
||||
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
|
||||
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
|
||||
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
|
||||
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
|
||||
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
|
||||
|
||||
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg);
|
||||
b3Assert(errNum==CL_SUCCESS);
|
||||
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
|
||||
|
||||
|
||||
|
||||
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
|
||||
b3Assert(errNum == CL_SUCCESS);
|
||||
}
|
||||
|
||||
m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
|
||||
|
||||
m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
|
||||
}
|
||||
b3GpuGridBroadphase::~b3GpuGridBroadphase()
|
||||
{
|
||||
clReleaseKernel( kCalcHashAABB);
|
||||
clReleaseKernel( kClearCellStart);
|
||||
clReleaseKernel( kFindCellStart);
|
||||
clReleaseKernel( kFindOverlappingPairs);
|
||||
clReleaseKernel( m_sap2Kernel);
|
||||
clReleaseKernel( m_copyAabbsKernel);
|
||||
|
||||
|
||||
|
||||
clReleaseKernel(kCalcHashAABB);
|
||||
clReleaseKernel(kClearCellStart);
|
||||
clReleaseKernel(kFindCellStart);
|
||||
clReleaseKernel(kFindOverlappingPairs);
|
||||
clReleaseKernel(m_sap2Kernel);
|
||||
clReleaseKernel(m_copyAabbsKernel);
|
||||
|
||||
delete m_sorter;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
|
||||
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
|
||||
{
|
||||
b3SapAabb aabb;
|
||||
aabb.m_minVec = aabbMin;
|
||||
aabb.m_maxVec = aabbMax;
|
||||
aabb.m_minIndices[3] = userPtr;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
|
||||
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
|
||||
|
||||
m_allAabbsCPU1.push_back(aabb);
|
||||
|
||||
}
|
||||
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
|
||||
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
|
||||
{
|
||||
b3SapAabb aabb;
|
||||
aabb.m_minVec = aabbMin;
|
||||
aabb.m_maxVec = aabbMax;
|
||||
aabb.m_minIndices[3] = userPtr;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
|
||||
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
|
||||
|
||||
m_allAabbsCPU1.push_back(aabb);
|
||||
}
|
||||
|
||||
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
|
||||
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
|
||||
{
|
||||
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
|
||||
|
||||
|
||||
if (0)
|
||||
{
|
||||
calculateOverlappingPairsHost(maxPairs);
|
||||
/*
|
||||
/*
|
||||
b3AlignedObjectArray<b3Int4> cpuPairs;
|
||||
m_gpuPairs.copyToHost(cpuPairs);
|
||||
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
|
||||
@@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
int numSmallAabbs = m_smallAabbsMappingGPU.size();
|
||||
|
||||
b3OpenCLArray<int> pairCount(m_context,m_queue);
|
||||
b3OpenCLArray<int> pairCount(m_context, m_queue);
|
||||
pairCount.push_back(0);
|
||||
m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody);
|
||||
m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody);
|
||||
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsMappingGPU.size();
|
||||
if (numLargeAabbs && numSmallAabbs)
|
||||
{
|
||||
B3_PROFILE("sap2Kernel");
|
||||
b3BufferInfoCL bInfo[] = {
|
||||
b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ),
|
||||
b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
|
||||
b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),
|
||||
b3BufferInfoCL( m_gpuPairs.getBufferCL() ),
|
||||
b3BufferInfoCL bInfo[] = {
|
||||
b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
|
||||
b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
|
||||
b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
|
||||
b3BufferInfoCL(m_gpuPairs.getBufferCL()),
|
||||
b3BufferInfoCL(pairCount.getBufferCL())};
|
||||
b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numLargeAabbs );
|
||||
launcher.setConst( numSmallAabbs);
|
||||
launcher.setConst( 0 );//axis is not used
|
||||
launcher.setConst( maxPairs );
|
||||
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
|
||||
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
|
||||
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numLargeAabbs);
|
||||
launcher.setConst(numSmallAabbs);
|
||||
launcher.setConst(0); //axis is not used
|
||||
launcher.setConst(maxPairs);
|
||||
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
|
||||
launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
|
||||
|
||||
int numPairs = pairCount.at(0);
|
||||
|
||||
if (numPairs >maxPairs)
|
||||
|
||||
if (numPairs > maxPairs)
|
||||
{
|
||||
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
|
||||
numPairs =maxPairs;
|
||||
numPairs = maxPairs;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
if (numSmallAabbs)
|
||||
{
|
||||
B3_PROFILE("gridKernel");
|
||||
m_hashGpu.resize(numSmallAabbs);
|
||||
{
|
||||
B3_PROFILE("kCalcHashAABB");
|
||||
b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB");
|
||||
b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
|
||||
launch.setConst(numSmallAabbs);
|
||||
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
|
||||
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
|
||||
@@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
|
||||
}
|
||||
|
||||
m_sorter->execute(m_hashGpu);
|
||||
|
||||
int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2];
|
||||
|
||||
int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
|
||||
m_cellStartGpu.resize(numCells);
|
||||
//b3AlignedObjectArray<int > cellStartCpu;
|
||||
|
||||
|
||||
|
||||
{
|
||||
B3_PROFILE("kClearCellStart");
|
||||
b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart");
|
||||
b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
|
||||
launch.setConst(numCells);
|
||||
launch.setBuffer(m_cellStartGpu.getBufferCL());
|
||||
launch.launch1D(numCells);
|
||||
//m_cellStartGpu.copyToHost(cellStartCpu);
|
||||
//printf("??\n");
|
||||
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
B3_PROFILE("kFindCellStart");
|
||||
b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart");
|
||||
b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
|
||||
launch.setConst(numSmallAabbs);
|
||||
launch.setBuffer(m_hashGpu.getBufferCL());
|
||||
launch.setBuffer(m_cellStartGpu.getBufferCL());
|
||||
launch.launch1D(numSmallAabbs);
|
||||
//m_cellStartGpu.copyToHost(cellStartCpu);
|
||||
//printf("??\n");
|
||||
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
B3_PROFILE("kFindOverlappingPairs");
|
||||
|
||||
|
||||
b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs");
|
||||
|
||||
b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
|
||||
launch.setConst(numSmallAabbs);
|
||||
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
|
||||
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
|
||||
launch.setBuffer(m_hashGpu.getBufferCL());
|
||||
launch.setBuffer(m_cellStartGpu.getBufferCL());
|
||||
|
||||
|
||||
launch.setBuffer(m_paramsGPU.getBufferCL());
|
||||
//launch.setBuffer(0);
|
||||
launch.setBuffer(pairCount.getBufferCL());
|
||||
launch.setBuffer(m_gpuPairs.getBufferCL());
|
||||
|
||||
|
||||
launch.setConst(maxPairs);
|
||||
launch.launch1D(numSmallAabbs);
|
||||
|
||||
|
||||
int numPairs = pairCount.at(0);
|
||||
if (numPairs >maxPairs)
|
||||
if (numPairs > maxPairs)
|
||||
{
|
||||
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
|
||||
numPairs =maxPairs;
|
||||
numPairs = maxPairs;
|
||||
}
|
||||
|
||||
|
||||
m_gpuPairs.resize(numPairs);
|
||||
|
||||
|
||||
if (0)
|
||||
{
|
||||
b3AlignedObjectArray<b3Int4> pairsCpu;
|
||||
m_gpuPairs.copyToHost(pairsCpu);
|
||||
|
||||
int sz = m_gpuPairs.size();
|
||||
printf("m_gpuPairs.size()=%d\n",sz);
|
||||
for (int i=0;i<m_gpuPairs.size();i++)
|
||||
printf("m_gpuPairs.size()=%d\n", sz);
|
||||
for (int i = 0; i < m_gpuPairs.size(); i++)
|
||||
{
|
||||
printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y);
|
||||
printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
|
||||
}
|
||||
|
||||
printf("?!?\n");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//calculateOverlappingPairsHost(maxPairs);
|
||||
}
|
||||
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
|
||||
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
|
||||
{
|
||||
|
||||
m_hostPairs.resize(0);
|
||||
m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
|
||||
for (int i=0;i<m_allAabbsCPU1.size();i++)
|
||||
for (int i = 0; i < m_allAabbsCPU1.size(); i++)
|
||||
{
|
||||
for (int j=i+1;j<m_allAabbsCPU1.size();j++)
|
||||
for (int j = i + 1; j < m_allAabbsCPU1.size(); j++)
|
||||
{
|
||||
if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
|
||||
m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
|
||||
m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
|
||||
{
|
||||
b3Int4 pair;
|
||||
int a = m_allAabbsCPU1[j].m_minIndices[3];
|
||||
int b = m_allAabbsCPU1[i].m_minIndices[3];
|
||||
if (a<=b)
|
||||
if (a <= b)
|
||||
{
|
||||
pair.x = a;
|
||||
pair.y = b;//store the original index in the unsorted aabb array
|
||||
} else
|
||||
pair.x = a;
|
||||
pair.y = b; //store the original index in the unsorted aabb array
|
||||
}
|
||||
else
|
||||
{
|
||||
pair.x = b;
|
||||
pair.y = a;//store the original index in the unsorted aabb array
|
||||
pair.y = a; //store the original index in the unsorted aabb array
|
||||
}
|
||||
|
||||
if (m_hostPairs.size()<maxPairs)
|
||||
|
||||
if (m_hostPairs.size() < maxPairs)
|
||||
{
|
||||
m_hostPairs.push_back(pair);
|
||||
}
|
||||
@@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
m_gpuPairs.copyFromHost(m_hostPairs);
|
||||
|
||||
|
||||
}
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
void b3GpuGridBroadphase::writeAabbsToGpu()
|
||||
{
|
||||
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
|
||||
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
|
||||
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
|
||||
|
||||
}
|
||||
|
||||
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
|
||||
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
|
||||
{
|
||||
return this->m_allAabbsGPU1.getBufferCL();
|
||||
}
|
||||
int b3GpuGridBroadphase::getNumOverlap()
|
||||
int b3GpuGridBroadphase::getNumOverlap()
|
||||
{
|
||||
return m_gpuPairs.size();
|
||||
}
|
||||
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
|
||||
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
|
||||
{
|
||||
return m_gpuPairs.getBufferCL();
|
||||
}
|
||||
|
||||
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
|
||||
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
|
||||
{
|
||||
return m_allAabbsGPU1;
|
||||
}
|
||||
|
||||
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
|
||||
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
|
||||
{
|
||||
return m_allAabbsCPU1;
|
||||
}
|
||||
@@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
|
||||
{
|
||||
return m_largeAabbsMappingGPU;
|
||||
}
|
||||
|
||||
|
||||
@@ -6,83 +6,75 @@
|
||||
|
||||
struct b3ParamsGridBroadphaseCL
|
||||
{
|
||||
|
||||
float m_invCellSize[4];
|
||||
int m_gridSize[4];
|
||||
int m_gridSize[4];
|
||||
|
||||
int getMaxBodiesPerCell() const
|
||||
int getMaxBodiesPerCell() const
|
||||
{
|
||||
return m_gridSize[3];
|
||||
}
|
||||
|
||||
void setMaxBodiesPerCell(int maxOverlap)
|
||||
void setMaxBodiesPerCell(int maxOverlap)
|
||||
{
|
||||
m_gridSize[3] = maxOverlap;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
|
||||
{
|
||||
protected:
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
|
||||
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
|
||||
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
|
||||
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
|
||||
|
||||
b3OpenCLArray<int> m_smallAabbsMappingGPU;
|
||||
b3OpenCLArray<int> m_smallAabbsMappingGPU;
|
||||
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
|
||||
|
||||
b3OpenCLArray<int> m_largeAabbsMappingGPU;
|
||||
b3OpenCLArray<int> m_largeAabbsMappingGPU;
|
||||
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
|
||||
|
||||
b3AlignedObjectArray<b3Int4> m_hostPairs;
|
||||
b3OpenCLArray<b3Int4> m_gpuPairs;
|
||||
b3OpenCLArray<b3Int4> m_gpuPairs;
|
||||
|
||||
b3OpenCLArray<b3SortData> m_hashGpu;
|
||||
b3OpenCLArray<int> m_cellStartGpu;
|
||||
|
||||
b3OpenCLArray<b3SortData> m_hashGpu;
|
||||
b3OpenCLArray<int> m_cellStartGpu;
|
||||
|
||||
b3ParamsGridBroadphaseCL m_paramsCPU;
|
||||
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
|
||||
b3ParamsGridBroadphaseCL m_paramsCPU;
|
||||
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
|
||||
|
||||
class b3RadixSort32CL* m_sorter;
|
||||
class b3RadixSort32CL* m_sorter;
|
||||
|
||||
public:
|
||||
|
||||
b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
|
||||
b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
|
||||
virtual ~b3GpuGridBroadphase();
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuGridBroadphase(ctx,device,q);
|
||||
return new b3GpuGridBroadphase(ctx, device, q);
|
||||
}
|
||||
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
|
||||
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
virtual void writeAabbsToGpu();
|
||||
|
||||
virtual cl_mem getAabbBufferWS();
|
||||
virtual int getNumOverlap();
|
||||
virtual cl_mem getOverlappingPairBuffer();
|
||||
virtual cl_mem getAabbBufferWS();
|
||||
virtual int getNumOverlap();
|
||||
virtual cl_mem getOverlappingPairBuffer();
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
|
||||
|
||||
};
|
||||
|
||||
#endif //B3_GPU_GRID_BROADPHASE_H
|
||||
#endif //B3_GPU_GRID_BROADPHASE_H
|
||||
@@ -16,177 +16,174 @@ subject to the following restrictions:
|
||||
|
||||
#include "b3GpuParallelLinearBvh.h"
|
||||
|
||||
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
|
||||
m_queue(queue),
|
||||
m_radixSorter(context, device, queue),
|
||||
|
||||
m_rootNodeIndex(context, queue),
|
||||
m_maxDistanceFromRoot(context, queue),
|
||||
m_temp(context, queue),
|
||||
|
||||
m_internalNodeAabbs(context, queue),
|
||||
m_internalNodeLeafIndexRanges(context, queue),
|
||||
m_internalNodeChildNodes(context, queue),
|
||||
m_internalNodeParentNodes(context, queue),
|
||||
|
||||
m_commonPrefixes(context, queue),
|
||||
m_commonPrefixLengths(context, queue),
|
||||
m_distanceFromRoot(context, queue),
|
||||
|
||||
m_leafNodeParentNodes(context, queue),
|
||||
m_mortonCodesAndAabbIndicies(context, queue),
|
||||
m_mergedAabb(context, queue),
|
||||
m_leafNodeAabbs(context, queue),
|
||||
|
||||
m_largeAabbs(context, queue)
|
||||
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
|
||||
m_radixSorter(context, device, queue),
|
||||
|
||||
m_rootNodeIndex(context, queue),
|
||||
m_maxDistanceFromRoot(context, queue),
|
||||
m_temp(context, queue),
|
||||
|
||||
m_internalNodeAabbs(context, queue),
|
||||
m_internalNodeLeafIndexRanges(context, queue),
|
||||
m_internalNodeChildNodes(context, queue),
|
||||
m_internalNodeParentNodes(context, queue),
|
||||
|
||||
m_commonPrefixes(context, queue),
|
||||
m_commonPrefixLengths(context, queue),
|
||||
m_distanceFromRoot(context, queue),
|
||||
|
||||
m_leafNodeParentNodes(context, queue),
|
||||
m_mortonCodesAndAabbIndicies(context, queue),
|
||||
m_mergedAabb(context, queue),
|
||||
m_leafNodeAabbs(context, queue),
|
||||
|
||||
m_largeAabbs(context, queue)
|
||||
{
|
||||
m_rootNodeIndex.resize(1);
|
||||
m_maxDistanceFromRoot.resize(1);
|
||||
m_temp.resize(1);
|
||||
|
||||
|
||||
//
|
||||
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
|
||||
|
||||
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
|
||||
|
||||
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
|
||||
cl_int error;
|
||||
char* additionalMacros = 0;
|
||||
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
|
||||
b3Assert(m_parallelLinearBvhProgram);
|
||||
|
||||
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
|
||||
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_separateAabbsKernel);
|
||||
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_findAllNodesMergedAabbKernel);
|
||||
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
|
||||
|
||||
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
|
||||
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
|
||||
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
|
||||
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
|
||||
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_findDistanceFromRootKernel);
|
||||
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
|
||||
|
||||
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
|
||||
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_findLeafIndexRangesKernel);
|
||||
|
||||
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
|
||||
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
|
||||
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_plbvhRayTraverseKernel);
|
||||
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_plbvhLargeAabbAabbTestKernel);
|
||||
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
|
||||
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
|
||||
b3Assert(m_plbvhLargeAabbRayTestKernel);
|
||||
}
|
||||
|
||||
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
|
||||
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
|
||||
{
|
||||
clReleaseKernel(m_separateAabbsKernel);
|
||||
clReleaseKernel(m_findAllNodesMergedAabbKernel);
|
||||
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
|
||||
|
||||
|
||||
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
|
||||
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
|
||||
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
|
||||
clReleaseKernel(m_findDistanceFromRootKernel);
|
||||
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
|
||||
|
||||
|
||||
clReleaseKernel(m_findLeafIndexRangesKernel);
|
||||
|
||||
|
||||
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
|
||||
clReleaseKernel(m_plbvhRayTraverseKernel);
|
||||
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
|
||||
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
|
||||
|
||||
|
||||
clReleaseProgram(m_parallelLinearBvhProgram);
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
|
||||
const b3OpenCLArray<int>& largeAabbIndices)
|
||||
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
|
||||
const b3OpenCLArray<int>& largeAabbIndices)
|
||||
{
|
||||
B3_PROFILE("b3ParallelLinearBvh::build()");
|
||||
|
||||
|
||||
int numLargeAabbs = largeAabbIndices.size();
|
||||
int numSmallAabbs = smallAabbIndices.size();
|
||||
|
||||
//Since all AABBs(both large and small) are input as a contiguous array,
|
||||
|
||||
//Since all AABBs(both large and small) are input as a contiguous array,
|
||||
//with 2 additional arrays used to indicate the indices of large and small AABBs,
|
||||
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
|
||||
{
|
||||
B3_PROFILE("Separate large and small AABBs");
|
||||
|
||||
|
||||
m_largeAabbs.resize(numLargeAabbs);
|
||||
m_leafNodeAabbs.resize(numSmallAabbs);
|
||||
|
||||
|
||||
//Write large AABBs into m_largeAabbs
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_largeAabbs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(largeAabbIndices.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(m_largeAabbs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numLargeAabbs);
|
||||
|
||||
|
||||
launcher.launch1D(numLargeAabbs);
|
||||
}
|
||||
|
||||
|
||||
//Write small AABBs into m_leafNodeAabbs
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(smallAabbIndices.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numSmallAabbs);
|
||||
|
||||
|
||||
launcher.launch1D(numSmallAabbs);
|
||||
}
|
||||
|
||||
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
|
||||
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
|
||||
int numInternalNodes = numLeaves - 1;
|
||||
|
||||
if(numLeaves < 2)
|
||||
|
||||
if (numLeaves < 2)
|
||||
{
|
||||
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
|
||||
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
|
||||
int rootNodeIndex = numLeaves - 1;
|
||||
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
|
||||
|
||||
|
||||
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
|
||||
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
|
||||
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
|
||||
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
|
||||
if(numLeaves == 1)
|
||||
if (numLeaves == 1)
|
||||
{
|
||||
b3SortData leaf;
|
||||
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
|
||||
|
||||
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
|
||||
|
||||
m_mortonCodesAndAabbIndicies.resize(1);
|
||||
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
|
||||
}
|
||||
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
{
|
||||
m_internalNodeAabbs.resize(numInternalNodes);
|
||||
@@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
|
||||
m_commonPrefixes.resize(numInternalNodes);
|
||||
m_commonPrefixLengths.resize(numInternalNodes);
|
||||
m_distanceFromRoot.resize(numInternalNodes);
|
||||
|
||||
|
||||
m_leafNodeParentNodes.resize(numLeaves);
|
||||
m_mortonCodesAndAabbIndicies.resize(numLeaves);
|
||||
m_mergedAabb.resize(numLeaves);
|
||||
}
|
||||
|
||||
//Find the merged AABB of all small AABBs; this is used to define the size of
|
||||
|
||||
//Find the merged AABB of all small AABBs; this is used to define the size of
|
||||
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
|
||||
{
|
||||
B3_PROFILE("Find AABB of merged nodes");
|
||||
|
||||
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
|
||||
|
||||
for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
|
||||
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
|
||||
|
||||
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
|
||||
|
||||
for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
|
||||
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
|
||||
};
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0]
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numAabbsNeedingMerge);
|
||||
|
||||
|
||||
launcher.launch1D(numAabbsNeedingMerge);
|
||||
}
|
||||
|
||||
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//Insert the center of the AABBs into a virtual grid,
|
||||
//then convert the discrete grid coordinates into a morton code
|
||||
//For each element in m_mortonCodesAndAabbIndicies, set
|
||||
@@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
|
||||
// m_value == small AABB index
|
||||
{
|
||||
B3_PROFILE("Assign morton codes");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(m_mergedAabb.getBufferCL()),
|
||||
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numLeaves);
|
||||
|
||||
|
||||
launcher.launch1D(numLeaves);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
{
|
||||
B3_PROFILE("Sort leaves by morton codes");
|
||||
|
||||
|
||||
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
constructBinaryRadixTree();
|
||||
|
||||
|
||||
|
||||
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
|
||||
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
|
||||
//The child nodes of each node split their parent's index range into 2 contiguous halves.
|
||||
@@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
|
||||
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
|
||||
{
|
||||
B3_PROFILE("m_findLeafIndexRangesKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
@@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou
|
||||
{
|
||||
int maxPairs = out_overlappingPairs.size();
|
||||
b3OpenCLArray<int>& numPairsGpu = m_temp;
|
||||
|
||||
|
||||
int reset = 0;
|
||||
numPairsGpu.copyFromHostPointer(&reset, 1);
|
||||
|
||||
|
||||
//
|
||||
if( m_leafNodeAabbs.size() > 1 )
|
||||
if (m_leafNodeAabbs.size() > 1)
|
||||
{
|
||||
B3_PROFILE("PLBVH small-small AABB test");
|
||||
|
||||
|
||||
int numQueryAabbs = m_leafNodeAabbs.size();
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
|
||||
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
|
||||
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(numPairsGpu.getBufferCL()),
|
||||
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(maxPairs);
|
||||
launcher.setConst(numQueryAabbs);
|
||||
|
||||
|
||||
launcher.launch1D(numQueryAabbs);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
int numLargeAabbRigids = m_largeAabbs.size();
|
||||
if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
|
||||
if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
|
||||
{
|
||||
B3_PROFILE("PLBVH large-small AABB test");
|
||||
|
||||
|
||||
int numQueryAabbs = m_leafNodeAabbs.size();
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
|
||||
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(numPairsGpu.getBufferCL()),
|
||||
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(maxPairs);
|
||||
launcher.setConst(numLargeAabbRigids);
|
||||
launcher.setConst(numQueryAabbs);
|
||||
|
||||
|
||||
launcher.launch1D(numQueryAabbs);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//
|
||||
int numPairs = -1;
|
||||
numPairsGpu.copyToHostPointer(&numPairs, 1);
|
||||
if(numPairs > maxPairs)
|
||||
if (numPairs > maxPairs)
|
||||
{
|
||||
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
|
||||
numPairs = maxPairs;
|
||||
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
|
||||
}
|
||||
|
||||
|
||||
out_overlappingPairs.resize(numPairs);
|
||||
}
|
||||
|
||||
|
||||
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
|
||||
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
|
||||
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
|
||||
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
|
||||
{
|
||||
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
|
||||
|
||||
|
||||
int numRays = rays.size();
|
||||
int maxRayRigidPairs = out_rayRigidPairs.size();
|
||||
|
||||
|
||||
int reset = 0;
|
||||
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
|
||||
|
||||
|
||||
//
|
||||
if( m_leafNodeAabbs.size() > 0 )
|
||||
if (m_leafNodeAabbs.size() > 0)
|
||||
{
|
||||
B3_PROFILE("PLBVH ray test small AABB");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( rays.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
|
||||
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
|
||||
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(rays.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
|
||||
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(maxRayRigidPairs);
|
||||
launcher.setConst(numRays);
|
||||
|
||||
|
||||
launcher.launch1D(numRays);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
int numLargeAabbRigids = m_largeAabbs.size();
|
||||
if(numLargeAabbRigids > 0)
|
||||
if (numLargeAabbRigids > 0)
|
||||
{
|
||||
B3_PROFILE("PLBVH ray test large AABB");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( rays.getBufferCL() ),
|
||||
|
||||
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
|
||||
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(rays.getBufferCL()),
|
||||
|
||||
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
|
||||
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numLargeAabbRigids);
|
||||
launcher.setConst(maxRayRigidPairs);
|
||||
launcher.setConst(numRays);
|
||||
|
||||
|
||||
launcher.launch1D(numRays);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
int numRayRigidPairs = -1;
|
||||
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
|
||||
|
||||
if(numRayRigidPairs > maxRayRigidPairs)
|
||||
|
||||
if (numRayRigidPairs > maxRayRigidPairs)
|
||||
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
|
||||
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
|
||||
{
|
||||
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
|
||||
|
||||
|
||||
int numLeaves = m_leafNodeAabbs.size();
|
||||
int numInternalNodes = numLeaves - 1;
|
||||
|
||||
|
||||
//Each internal node is placed in between 2 leaf nodes.
|
||||
//By using this arrangement and computing the common prefix between
|
||||
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
|
||||
{
|
||||
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
|
||||
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
|
||||
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
//For each leaf node, select its parent node by
|
||||
|
||||
//For each leaf node, select its parent node by
|
||||
//comparing the 2 nearest internal nodes and assign child node indices
|
||||
{
|
||||
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
|
||||
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
|
||||
b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numLeaves);
|
||||
|
||||
|
||||
launcher.launch1D(numLeaves);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//For each internal node, perform 2 binary searches among the other internal nodes
|
||||
//to its left and right to find its potential parent nodes and assign child node indices
|
||||
{
|
||||
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
|
||||
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//Find the number of nodes seperating each internal node and the root node
|
||||
//so that the AABBs can be set using the next kernel.
|
||||
//Also determine the maximum number of nodes separating an internal node and the root node.
|
||||
{
|
||||
B3_PROFILE("m_findDistanceFromRootKernel");
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
|
||||
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
|
||||
};
|
||||
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
|
||||
b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
|
||||
launcher.launch1D(numInternalNodes);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
|
||||
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
|
||||
//the tree towards the root to set the AABBs of each internal node; each internal node
|
||||
//checks its children and merges their AABBs
|
||||
{
|
||||
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
|
||||
|
||||
|
||||
int maxDistanceFromRoot = -1;
|
||||
{
|
||||
B3_PROFILE("copy maxDistanceFromRoot to CPU");
|
||||
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
|
||||
|
||||
for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
|
||||
{
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
|
||||
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
|
||||
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
|
||||
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
|
||||
};
|
||||
|
||||
b3BufferInfoCL bufferInfo[] =
|
||||
{
|
||||
b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
|
||||
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
|
||||
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
|
||||
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
|
||||
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
|
||||
launcher.setConst(maxDistanceFromRoot);
|
||||
launcher.setConst(distanceFromRoot);
|
||||
launcher.setConst(numInternalNodes);
|
||||
|
||||
|
||||
//It may seem inefficent to launch a thread for each internal node when a
|
||||
//much smaller number of nodes is actually processed, but this is actually
|
||||
//faster than determining the exact nodes that are ready to merge their child AABBs.
|
||||
//faster than determining the exact nodes that are ready to merge their child AABBs.
|
||||
launcher.launch1D(numInternalNodes);
|
||||
}
|
||||
|
||||
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -37,10 +37,10 @@ subject to the following restrictions:
|
||||
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
|
||||
///@par
|
||||
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
|
||||
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
|
||||
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
|
||||
/// - [fully parallel] Sort morton codes
|
||||
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
|
||||
/// - [somewhat parallel] Set internal node AABBs
|
||||
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
|
||||
/// - [somewhat parallel] Set internal node AABBs
|
||||
///@par
|
||||
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
|
||||
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
|
||||
@@ -49,75 +49,75 @@ subject to the following restrictions:
|
||||
class b3GpuParallelLinearBvh
|
||||
{
|
||||
cl_command_queue m_queue;
|
||||
|
||||
|
||||
cl_program m_parallelLinearBvhProgram;
|
||||
|
||||
|
||||
cl_kernel m_separateAabbsKernel;
|
||||
cl_kernel m_findAllNodesMergedAabbKernel;
|
||||
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
|
||||
|
||||
|
||||
//Binary radix tree construction kernels
|
||||
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
|
||||
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
|
||||
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
|
||||
cl_kernel m_findDistanceFromRootKernel;
|
||||
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
|
||||
|
||||
|
||||
cl_kernel m_findLeafIndexRangesKernel;
|
||||
|
||||
|
||||
//Traversal kernels
|
||||
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
|
||||
cl_kernel m_plbvhRayTraverseKernel;
|
||||
cl_kernel m_plbvhLargeAabbAabbTestKernel;
|
||||
cl_kernel m_plbvhLargeAabbRayTestKernel;
|
||||
|
||||
|
||||
b3RadixSort32CL m_radixSorter;
|
||||
|
||||
|
||||
//1 element
|
||||
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
|
||||
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
|
||||
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
|
||||
|
||||
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
|
||||
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
|
||||
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
|
||||
|
||||
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
|
||||
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
|
||||
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
|
||||
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
|
||||
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
|
||||
|
||||
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
|
||||
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
|
||||
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
|
||||
|
||||
//1 element per internal node; for binary radix tree construction
|
||||
b3OpenCLArray<b3Int64> m_commonPrefixes;
|
||||
b3OpenCLArray<int> m_commonPrefixLengths;
|
||||
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
|
||||
|
||||
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
|
||||
|
||||
//1 element per leaf node (leaf nodes only include small AABBs)
|
||||
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
|
||||
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
|
||||
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
|
||||
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
|
||||
|
||||
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
|
||||
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
|
||||
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
|
||||
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
|
||||
|
||||
//1 element per large AABB, which is not stored in the BVH
|
||||
b3OpenCLArray<b3SapAabb> m_largeAabbs;
|
||||
|
||||
|
||||
public:
|
||||
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
|
||||
virtual ~b3GpuParallelLinearBvh();
|
||||
|
||||
|
||||
///Must be called before any other function
|
||||
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
|
||||
const b3OpenCLArray<int>& largeAabbIndices);
|
||||
|
||||
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
|
||||
const b3OpenCLArray<int>& largeAabbIndices);
|
||||
|
||||
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
|
||||
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
|
||||
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
|
||||
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
|
||||
|
||||
|
||||
///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
|
||||
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
|
||||
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
|
||||
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
|
||||
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
|
||||
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
|
||||
|
||||
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
|
||||
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
|
||||
|
||||
private:
|
||||
void constructBinaryRadixTree();
|
||||
};
|
||||
|
||||
@@ -13,45 +13,44 @@ subject to the following restrictions:
|
||||
|
||||
#include "b3GpuParallelLinearBvhBroadphase.h"
|
||||
|
||||
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
|
||||
m_plbvh(context, device, queue),
|
||||
|
||||
m_overlappingPairsGpu(context, queue),
|
||||
|
||||
m_aabbsGpu(context, queue),
|
||||
m_smallAabbsMappingGpu(context, queue),
|
||||
m_largeAabbsMappingGpu(context, queue)
|
||||
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
|
||||
|
||||
m_overlappingPairsGpu(context, queue),
|
||||
|
||||
m_aabbsGpu(context, queue),
|
||||
m_smallAabbsMappingGpu(context, queue),
|
||||
m_largeAabbsMappingGpu(context, queue)
|
||||
{
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
|
||||
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
|
||||
{
|
||||
int newAabbIndex = m_aabbsCpu.size();
|
||||
|
||||
b3SapAabb aabb;
|
||||
aabb.m_minVec = aabbMin;
|
||||
aabb.m_maxVec = aabbMax;
|
||||
|
||||
|
||||
aabb.m_minIndices[3] = userPtr;
|
||||
aabb.m_signedMaxIndices[3] = newAabbIndex;
|
||||
|
||||
|
||||
m_smallAabbsMappingCpu.push_back(newAabbIndex);
|
||||
|
||||
|
||||
m_aabbsCpu.push_back(aabb);
|
||||
}
|
||||
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
|
||||
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
|
||||
{
|
||||
int newAabbIndex = m_aabbsCpu.size();
|
||||
|
||||
b3SapAabb aabb;
|
||||
aabb.m_minVec = aabbMin;
|
||||
aabb.m_maxVec = aabbMax;
|
||||
|
||||
|
||||
aabb.m_minIndices[3] = userPtr;
|
||||
aabb.m_signedMaxIndices[3] = newAabbIndex;
|
||||
|
||||
|
||||
m_largeAabbsMappingCpu.push_back(newAabbIndex);
|
||||
|
||||
|
||||
m_aabbsCpu.push_back(aabb);
|
||||
}
|
||||
|
||||
@@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
|
||||
{
|
||||
//Reconstruct BVH
|
||||
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
|
||||
|
||||
|
||||
//
|
||||
m_overlappingPairsGpu.resize(maxPairs);
|
||||
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
|
||||
}
|
||||
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
|
||||
{
|
||||
b3Assert(0); //CPU version not implemented
|
||||
b3Assert(0); //CPU version not implemented
|
||||
}
|
||||
|
||||
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
|
||||
{
|
||||
m_aabbsGpu.copyFromHost(m_aabbsCpu);
|
||||
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
|
||||
{
|
||||
m_aabbsGpu.copyFromHost(m_aabbsCpu);
|
||||
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
|
||||
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -21,42 +21,42 @@ subject to the following restrictions:
|
||||
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
|
||||
{
|
||||
b3GpuParallelLinearBvh m_plbvh;
|
||||
|
||||
|
||||
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
|
||||
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
|
||||
b3OpenCLArray<int> m_smallAabbsMappingGpu;
|
||||
b3OpenCLArray<int> m_largeAabbsMappingGpu;
|
||||
|
||||
|
||||
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
|
||||
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
|
||||
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
|
||||
|
||||
|
||||
public:
|
||||
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
|
||||
virtual ~b3GpuParallelLinearBvhBroadphase() {}
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
virtual void writeAabbsToGpu();
|
||||
|
||||
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
|
||||
|
||||
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
|
||||
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
|
||||
|
||||
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
|
||||
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
|
||||
|
||||
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
|
||||
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
|
||||
{
|
||||
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -2,7 +2,7 @@
|
||||
#define B3_GPU_SAP_BROADPHASE_H
|
||||
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
|
||||
class b3Vector3;
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
|
||||
@@ -11,141 +11,133 @@ class b3Vector3;
|
||||
|
||||
#include "b3GpuBroadphaseInterface.h"
|
||||
|
||||
|
||||
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
|
||||
{
|
||||
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
cl_kernel m_flipFloatKernel;
|
||||
cl_kernel m_scatterKernel ;
|
||||
cl_kernel m_copyAabbsKernel;
|
||||
cl_kernel m_sapKernel;
|
||||
cl_kernel m_sap2Kernel;
|
||||
cl_kernel m_prepareSumVarianceKernel;
|
||||
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
cl_kernel m_flipFloatKernel;
|
||||
cl_kernel m_scatterKernel;
|
||||
cl_kernel m_copyAabbsKernel;
|
||||
cl_kernel m_sapKernel;
|
||||
cl_kernel m_sap2Kernel;
|
||||
cl_kernel m_prepareSumVarianceKernel;
|
||||
|
||||
class b3RadixSort32CL* m_sorter;
|
||||
|
||||
///test for 3d SAP
|
||||
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
|
||||
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
|
||||
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
|
||||
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
|
||||
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
|
||||
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
|
||||
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
|
||||
|
||||
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
|
||||
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
|
||||
b3OpenCLArray<int> m_addedCountGPU;
|
||||
b3OpenCLArray<int> m_removedCountGPU;
|
||||
|
||||
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
|
||||
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
|
||||
b3OpenCLArray<int> m_addedCountGPU;
|
||||
b3OpenCLArray<int> m_removedCountGPU;
|
||||
|
||||
int m_currentBuffer;
|
||||
int m_currentBuffer;
|
||||
|
||||
public:
|
||||
|
||||
b3OpenCLArray<int> m_pairCount;
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
|
||||
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
|
||||
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
|
||||
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
|
||||
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
|
||||
{
|
||||
return m_allAabbsGPU;
|
||||
}
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
|
||||
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
|
||||
{
|
||||
return m_allAabbsCPU;
|
||||
}
|
||||
|
||||
b3OpenCLArray<b3Vector3> m_sum;
|
||||
b3OpenCLArray<b3Vector3> m_sum2;
|
||||
b3OpenCLArray<b3Vector3> m_dst;
|
||||
b3OpenCLArray<b3Vector3> m_sum;
|
||||
b3OpenCLArray<b3Vector3> m_sum2;
|
||||
b3OpenCLArray<b3Vector3> m_dst;
|
||||
|
||||
b3OpenCLArray<int> m_smallAabbsMappingGPU;
|
||||
b3OpenCLArray<int> m_smallAabbsMappingGPU;
|
||||
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
|
||||
|
||||
b3OpenCLArray<int> m_largeAabbsMappingGPU;
|
||||
b3OpenCLArray<int> m_largeAabbsMappingGPU;
|
||||
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
|
||||
|
||||
|
||||
b3OpenCLArray<b3Int4> m_overlappingPairs;
|
||||
b3OpenCLArray<b3Int4> m_overlappingPairs;
|
||||
|
||||
//temporary gpu work memory
|
||||
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
|
||||
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
|
||||
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
|
||||
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
|
||||
|
||||
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
|
||||
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
|
||||
|
||||
enum b3GpuSapKernelType
|
||||
{
|
||||
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1,
|
||||
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
|
||||
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
|
||||
B3_GPU_SAP_KERNEL_ORIGINAL,
|
||||
B3_GPU_SAP_KERNEL_BARRIER,
|
||||
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
|
||||
};
|
||||
|
||||
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
|
||||
b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
|
||||
virtual ~b3GpuSapBroadphase();
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
|
||||
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
|
||||
}
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
|
||||
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
|
||||
}
|
||||
|
||||
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
|
||||
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
|
||||
}
|
||||
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
|
||||
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
|
||||
}
|
||||
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q)
|
||||
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
|
||||
{
|
||||
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
|
||||
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
|
||||
}
|
||||
|
||||
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
|
||||
void reset();
|
||||
virtual void calculateOverlappingPairs(int maxPairs);
|
||||
virtual void calculateOverlappingPairsHost(int maxPairs);
|
||||
|
||||
void reset();
|
||||
|
||||
void init3dSap();
|
||||
virtual void calculateOverlappingPairsHostIncremental3Sap();
|
||||
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
virtual void writeAabbsToGpu();
|
||||
|
||||
virtual cl_mem getAabbBufferWS();
|
||||
virtual int getNumOverlap();
|
||||
virtual cl_mem getOverlappingPairBuffer();
|
||||
|
||||
virtual cl_mem getAabbBufferWS();
|
||||
virtual int getNumOverlap();
|
||||
virtual cl_mem getOverlappingPairBuffer();
|
||||
|
||||
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
|
||||
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
|
||||
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
|
||||
};
|
||||
|
||||
#endif //B3_GPU_SAP_BROADPHASE_H
|
||||
#endif //B3_GPU_SAP_BROADPHASE_H
|
||||
@@ -5,10 +5,9 @@
|
||||
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
|
||||
|
||||
///just make sure that the b3Aabb is 16-byte aligned
|
||||
B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb
|
||||
{
|
||||
B3_ATTRIBUTE_ALIGNED16(struct)
|
||||
b3SapAabb : public b3Aabb{
|
||||
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
#endif //B3_SAP_AABB_H
|
||||
#endif //B3_SAP_AABB_H
|
||||
|
||||
@@ -1,199 +1,198 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* gridBroadphaseCL= \
|
||||
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
|
||||
"{\n"
|
||||
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
|
||||
" gridPos.x &= gridDim.x - 1;\n"
|
||||
" gridPos.y &= gridDim.y - 1;\n"
|
||||
" gridPos.z &= gridDim.z - 1;\n"
|
||||
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
|
||||
" return hash;\n"
|
||||
"} \n"
|
||||
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
|
||||
"{\n"
|
||||
" int4 gridPos;\n"
|
||||
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
|
||||
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
|
||||
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
|
||||
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
|
||||
" return gridPos;\n"
|
||||
"}\n"
|
||||
"// calculate grid hash value for each body using its AABB\n"
|
||||
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
|
||||
"{\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" if(index >= numObjects)\n"
|
||||
" {\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
|
||||
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
|
||||
" float4 pos;\n"
|
||||
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
|
||||
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
|
||||
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
|
||||
" pos.w = 0.f;\n"
|
||||
" // get address in grid\n"
|
||||
" int4 gridPos = getGridPos(pos, pParams);\n"
|
||||
" int gridHash = getPosHash(gridPos, pParams);\n"
|
||||
" // store grid hash and body index\n"
|
||||
" int2 hashVal;\n"
|
||||
" hashVal.x = gridHash;\n"
|
||||
" hashVal.y = index;\n"
|
||||
" pHash[index] = hashVal;\n"
|
||||
"}\n"
|
||||
"__kernel void kClearCellStart( int numCells, \n"
|
||||
" __global int* pCellStart )\n"
|
||||
"{\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" if(index >= numCells)\n"
|
||||
" {\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" pCellStart[index] = -1;\n"
|
||||
"}\n"
|
||||
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
|
||||
"{\n"
|
||||
" __local int sharedHash[513];\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" int2 sortedData;\n"
|
||||
" if(index < numObjects)\n"
|
||||
" {\n"
|
||||
" sortedData = pHash[index];\n"
|
||||
" // Load hash data into shared memory so that we can look \n"
|
||||
" // at neighboring body's hash value without loading\n"
|
||||
" // two hash values per thread\n"
|
||||
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
|
||||
" if((index > 0) && (get_local_id(0) == 0))\n"
|
||||
" {\n"
|
||||
" // first thread in block must load neighbor body hash\n"
|
||||
" sharedHash[0] = pHash[index-1].x;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" if(index < numObjects)\n"
|
||||
" {\n"
|
||||
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
|
||||
" {\n"
|
||||
" cellStart[sortedData.x] = index;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
|
||||
"{\n"
|
||||
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
|
||||
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
|
||||
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
|
||||
"}\n"
|
||||
"//search for AABB 'index' against other AABBs' in this cell\n"
|
||||
"void findPairsInCell( int numObjects,\n"
|
||||
" int4 gridPos,\n"
|
||||
" int index,\n"
|
||||
" __global int2* pHash,\n"
|
||||
" __global int* pCellStart,\n"
|
||||
" __global float4* allpAABB, \n"
|
||||
" __global const int* smallAabbMapping,\n"
|
||||
" __global float4* pParams,\n"
|
||||
" volatile __global int* pairCount,\n"
|
||||
" __global int4* pPairBuff2,\n"
|
||||
" int maxPairs\n"
|
||||
" )\n"
|
||||
"{\n"
|
||||
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
|
||||
" int maxBodiesPerCell = pGridDim.w;\n"
|
||||
" int gridHash = getPosHash(gridPos, pParams);\n"
|
||||
" // get start of bucket for this cell\n"
|
||||
" int bucketStart = pCellStart[gridHash];\n"
|
||||
" if (bucketStart == -1)\n"
|
||||
" {\n"
|
||||
" return; // cell empty\n"
|
||||
" }\n"
|
||||
" // iterate over bodies in this cell\n"
|
||||
" int2 sortedData = pHash[index];\n"
|
||||
" int unsorted_indx = sortedData.y;\n"
|
||||
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
|
||||
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
|
||||
" int handleIndex = as_int(min0.w);\n"
|
||||
" \n"
|
||||
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
|
||||
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
|
||||
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
|
||||
" {\n"
|
||||
" int2 cellData = pHash[index2];\n"
|
||||
" if (cellData.x != gridHash)\n"
|
||||
" {\n"
|
||||
" break; // no longer in same bucket\n"
|
||||
" }\n"
|
||||
" int unsorted_indx2 = cellData.y;\n"
|
||||
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
|
||||
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
|
||||
" { \n"
|
||||
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
|
||||
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
|
||||
" if(testAABBOverlap(min0, max0, min1, max1))\n"
|
||||
" {\n"
|
||||
" if (pairCount)\n"
|
||||
" {\n"
|
||||
" int handleIndex2 = as_int(min1.w);\n"
|
||||
" if (handleIndex<handleIndex2)\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_add(pairCount,1);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" int4 newpair;\n"
|
||||
" newpair.x = handleIndex;\n"
|
||||
" newpair.y = handleIndex2;\n"
|
||||
" newpair.z = -1;\n"
|
||||
" newpair.w = -1;\n"
|
||||
" pPairBuff2[curPair] = newpair;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void kFindOverlappingPairs( int numObjects,\n"
|
||||
" __global float4* allpAABB, \n"
|
||||
" __global const int* smallAabbMapping,\n"
|
||||
" __global int2* pHash, \n"
|
||||
" __global int* pCellStart, \n"
|
||||
" __global float4* pParams ,\n"
|
||||
" volatile __global int* pairCount,\n"
|
||||
" __global int4* pPairBuff2,\n"
|
||||
" int maxPairs\n"
|
||||
" )\n"
|
||||
"{\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" if(index >= numObjects)\n"
|
||||
" {\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" int2 sortedData = pHash[index];\n"
|
||||
" int unsorted_indx = sortedData.y;\n"
|
||||
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
|
||||
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
|
||||
" float4 pos;\n"
|
||||
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
|
||||
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
|
||||
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
|
||||
" // get address in grid\n"
|
||||
" int4 gridPosA = getGridPos(pos, pParams);\n"
|
||||
" int4 gridPosB; \n"
|
||||
" // examine only neighbouring cells\n"
|
||||
" for(int z=-1; z<=1; z++) \n"
|
||||
" {\n"
|
||||
" gridPosB.z = gridPosA.z + z;\n"
|
||||
" for(int y=-1; y<=1; y++) \n"
|
||||
" {\n"
|
||||
" gridPosB.y = gridPosA.y + y;\n"
|
||||
" for(int x=-1; x<=1; x++) \n"
|
||||
" {\n"
|
||||
" gridPosB.x = gridPosA.x + x;\n"
|
||||
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
static const char* gridBroadphaseCL =
|
||||
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
|
||||
"{\n"
|
||||
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
|
||||
" gridPos.x &= gridDim.x - 1;\n"
|
||||
" gridPos.y &= gridDim.y - 1;\n"
|
||||
" gridPos.z &= gridDim.z - 1;\n"
|
||||
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
|
||||
" return hash;\n"
|
||||
"} \n"
|
||||
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
|
||||
"{\n"
|
||||
" int4 gridPos;\n"
|
||||
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
|
||||
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
|
||||
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
|
||||
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
|
||||
" return gridPos;\n"
|
||||
"}\n"
|
||||
"// calculate grid hash value for each body using its AABB\n"
|
||||
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
|
||||
"{\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" if(index >= numObjects)\n"
|
||||
" {\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
|
||||
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
|
||||
" float4 pos;\n"
|
||||
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
|
||||
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
|
||||
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
|
||||
" pos.w = 0.f;\n"
|
||||
" // get address in grid\n"
|
||||
" int4 gridPos = getGridPos(pos, pParams);\n"
|
||||
" int gridHash = getPosHash(gridPos, pParams);\n"
|
||||
" // store grid hash and body index\n"
|
||||
" int2 hashVal;\n"
|
||||
" hashVal.x = gridHash;\n"
|
||||
" hashVal.y = index;\n"
|
||||
" pHash[index] = hashVal;\n"
|
||||
"}\n"
|
||||
"__kernel void kClearCellStart( int numCells, \n"
|
||||
" __global int* pCellStart )\n"
|
||||
"{\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" if(index >= numCells)\n"
|
||||
" {\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" pCellStart[index] = -1;\n"
|
||||
"}\n"
|
||||
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
|
||||
"{\n"
|
||||
" __local int sharedHash[513];\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" int2 sortedData;\n"
|
||||
" if(index < numObjects)\n"
|
||||
" {\n"
|
||||
" sortedData = pHash[index];\n"
|
||||
" // Load hash data into shared memory so that we can look \n"
|
||||
" // at neighboring body's hash value without loading\n"
|
||||
" // two hash values per thread\n"
|
||||
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
|
||||
" if((index > 0) && (get_local_id(0) == 0))\n"
|
||||
" {\n"
|
||||
" // first thread in block must load neighbor body hash\n"
|
||||
" sharedHash[0] = pHash[index-1].x;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" if(index < numObjects)\n"
|
||||
" {\n"
|
||||
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
|
||||
" {\n"
|
||||
" cellStart[sortedData.x] = index;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
|
||||
"{\n"
|
||||
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
|
||||
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
|
||||
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
|
||||
"}\n"
|
||||
"//search for AABB 'index' against other AABBs' in this cell\n"
|
||||
"void findPairsInCell( int numObjects,\n"
|
||||
" int4 gridPos,\n"
|
||||
" int index,\n"
|
||||
" __global int2* pHash,\n"
|
||||
" __global int* pCellStart,\n"
|
||||
" __global float4* allpAABB, \n"
|
||||
" __global const int* smallAabbMapping,\n"
|
||||
" __global float4* pParams,\n"
|
||||
" volatile __global int* pairCount,\n"
|
||||
" __global int4* pPairBuff2,\n"
|
||||
" int maxPairs\n"
|
||||
" )\n"
|
||||
"{\n"
|
||||
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
|
||||
" int maxBodiesPerCell = pGridDim.w;\n"
|
||||
" int gridHash = getPosHash(gridPos, pParams);\n"
|
||||
" // get start of bucket for this cell\n"
|
||||
" int bucketStart = pCellStart[gridHash];\n"
|
||||
" if (bucketStart == -1)\n"
|
||||
" {\n"
|
||||
" return; // cell empty\n"
|
||||
" }\n"
|
||||
" // iterate over bodies in this cell\n"
|
||||
" int2 sortedData = pHash[index];\n"
|
||||
" int unsorted_indx = sortedData.y;\n"
|
||||
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
|
||||
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
|
||||
" int handleIndex = as_int(min0.w);\n"
|
||||
" \n"
|
||||
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
|
||||
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
|
||||
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
|
||||
" {\n"
|
||||
" int2 cellData = pHash[index2];\n"
|
||||
" if (cellData.x != gridHash)\n"
|
||||
" {\n"
|
||||
" break; // no longer in same bucket\n"
|
||||
" }\n"
|
||||
" int unsorted_indx2 = cellData.y;\n"
|
||||
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
|
||||
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
|
||||
" { \n"
|
||||
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
|
||||
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
|
||||
" if(testAABBOverlap(min0, max0, min1, max1))\n"
|
||||
" {\n"
|
||||
" if (pairCount)\n"
|
||||
" {\n"
|
||||
" int handleIndex2 = as_int(min1.w);\n"
|
||||
" if (handleIndex<handleIndex2)\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_add(pairCount,1);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" int4 newpair;\n"
|
||||
" newpair.x = handleIndex;\n"
|
||||
" newpair.y = handleIndex2;\n"
|
||||
" newpair.z = -1;\n"
|
||||
" newpair.w = -1;\n"
|
||||
" pPairBuff2[curPair] = newpair;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void kFindOverlappingPairs( int numObjects,\n"
|
||||
" __global float4* allpAABB, \n"
|
||||
" __global const int* smallAabbMapping,\n"
|
||||
" __global int2* pHash, \n"
|
||||
" __global int* pCellStart, \n"
|
||||
" __global float4* pParams ,\n"
|
||||
" volatile __global int* pairCount,\n"
|
||||
" __global int4* pPairBuff2,\n"
|
||||
" int maxPairs\n"
|
||||
" )\n"
|
||||
"{\n"
|
||||
" int index = get_global_id(0);\n"
|
||||
" if(index >= numObjects)\n"
|
||||
" {\n"
|
||||
" return;\n"
|
||||
" }\n"
|
||||
" int2 sortedData = pHash[index];\n"
|
||||
" int unsorted_indx = sortedData.y;\n"
|
||||
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
|
||||
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
|
||||
" float4 pos;\n"
|
||||
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
|
||||
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
|
||||
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
|
||||
" // get address in grid\n"
|
||||
" int4 gridPosA = getGridPos(pos, pParams);\n"
|
||||
" int4 gridPosB; \n"
|
||||
" // examine only neighbouring cells\n"
|
||||
" for(int z=-1; z<=1; z++) \n"
|
||||
" {\n"
|
||||
" gridPosB.z = gridPosA.z + z;\n"
|
||||
" for(int y=-1; y<=1; y++) \n"
|
||||
" {\n"
|
||||
" gridPosB.y = gridPosA.y + y;\n"
|
||||
" for(int x=-1; x<=1; x++) \n"
|
||||
" {\n"
|
||||
" gridPosB.x = gridPosA.x + x;\n"
|
||||
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n";
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,342 +1,341 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* sapCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"#define NEW_PAIR_MARKER -1\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} btAabbCL;\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numUnsortedAabbs)\n"
|
||||
" return;\n"
|
||||
" int j = get_global_id(1);\n"
|
||||
" if (j>=numUnSortedAabbs2)\n"
|
||||
" return;\n"
|
||||
" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
|
||||
" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" \n"
|
||||
" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
|
||||
" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
|
||||
" if (xIndex>yIndex)\n"
|
||||
" {\n"
|
||||
" int tmp = xIndex;\n"
|
||||
" xIndex=yIndex;\n"
|
||||
" yIndex=tmp;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" myPair.x = xIndex;\n"
|
||||
" myPair.y = yIndex;\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" for (int j=i+1;j<numObjects;j++)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" for (int j=i+1;j<numObjects;j++)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"//http://stereopsis.com/radix.html\n"
|
||||
"unsigned int FloatFlip(float fl);\n"
|
||||
"unsigned int FloatFlip(float fl)\n"
|
||||
"{\n"
|
||||
" unsigned int f = *(unsigned int*)&fl;\n"
|
||||
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
|
||||
" return f ^ mask;\n"
|
||||
"}\n"
|
||||
"float IFloatFlip(unsigned int f);\n"
|
||||
"float IFloatFlip(unsigned int f)\n"
|
||||
"{\n"
|
||||
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
|
||||
" unsigned int fl = f ^ mask;\n"
|
||||
" return *(float*)&fl;\n"
|
||||
"}\n"
|
||||
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" int src = destAabbs[i].m_maxIndices[3];\n"
|
||||
" destAabbs[i] = allAabbs[src];\n"
|
||||
" destAabbs[i].m_maxIndices[3] = src;\n"
|
||||
"}\n"
|
||||
"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
|
||||
" sortData[i].y = i;\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
|
||||
"}\n"
|
||||
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numAabbs)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
|
||||
" \n"
|
||||
" float4 s;\n"
|
||||
" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
|
||||
" sum[i]=s;\n"
|
||||
" sum2[i]=s*s; \n"
|
||||
"}\n"
|
||||
;
|
||||
static const char* sapCL =
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"#define NEW_PAIR_MARKER -1\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} btAabbCL;\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numUnsortedAabbs)\n"
|
||||
" return;\n"
|
||||
" int j = get_global_id(1);\n"
|
||||
" if (j>=numUnSortedAabbs2)\n"
|
||||
" return;\n"
|
||||
" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
|
||||
" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" \n"
|
||||
" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
|
||||
" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
|
||||
" if (xIndex>yIndex)\n"
|
||||
" {\n"
|
||||
" int tmp = xIndex;\n"
|
||||
" xIndex=yIndex;\n"
|
||||
" yIndex=tmp;\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" myPair.x = xIndex;\n"
|
||||
" myPair.y = yIndex;\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" for (int j=i+1;j<numObjects;j++)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" for (int j=i+1;j<numObjects;j++)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
"}\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int4 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" myPair.z = NEW_PAIR_MARKER;\n"
|
||||
" myPair.w = NEW_PAIR_MARKER;\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"//http://stereopsis.com/radix.html\n"
|
||||
"unsigned int FloatFlip(float fl);\n"
|
||||
"unsigned int FloatFlip(float fl)\n"
|
||||
"{\n"
|
||||
" unsigned int f = *(unsigned int*)&fl;\n"
|
||||
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
|
||||
" return f ^ mask;\n"
|
||||
"}\n"
|
||||
"float IFloatFlip(unsigned int f);\n"
|
||||
"float IFloatFlip(unsigned int f)\n"
|
||||
"{\n"
|
||||
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
|
||||
" unsigned int fl = f ^ mask;\n"
|
||||
" return *(float*)&fl;\n"
|
||||
"}\n"
|
||||
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" int src = destAabbs[i].m_maxIndices[3];\n"
|
||||
" destAabbs[i] = allAabbs[src];\n"
|
||||
" destAabbs[i].m_maxIndices[3] = src;\n"
|
||||
"}\n"
|
||||
"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
|
||||
" sortData[i].y = i;\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
|
||||
"}\n"
|
||||
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numAabbs)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
|
||||
" \n"
|
||||
" float4 s;\n"
|
||||
" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
|
||||
" sum[i]=s;\n"
|
||||
" sum2[i]=s*s; \n"
|
||||
"}\n";
|
||||
|
||||
Reference in New Issue
Block a user