Code-style consistency improvement:

Apply clang-format-all.sh using the _clang-format file through all the cpp/.h files.
make sure not to apply it to certain serialization structures, since some parser expects the * as part of the name, instead of type.
This commit contains no other changes aside from adding and applying clang-format-all.sh
This commit is contained in:
erwincoumans
2018-09-23 14:17:31 -07:00
parent b73b05e9fb
commit ab8f16961e
1773 changed files with 1081087 additions and 474249 deletions

View File

@@ -12,33 +12,31 @@
class b3GpuBroadphaseInterface
{
public:
typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q);
typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuBroadphaseInterface()
{
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
virtual void calculateOverlappingPairs(int maxPairs)=0;
virtual void calculateOverlappingPairsHost(int maxPairs)=0;
virtual void calculateOverlappingPairs(int maxPairs) = 0;
virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu()=0;
virtual void writeAabbsToGpu() = 0;
virtual cl_mem getAabbBufferWS()=0;
virtual int getNumOverlap()=0;
virtual cl_mem getOverlappingPairBuffer()=0;
virtual cl_mem getAabbBufferWS() = 0;
virtual int getNumOverlap() = 0;
virtual cl_mem getOverlappingPairBuffer() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
};
#endif //B3_GPU_BROADPHASE_INTERFACE_H
#endif //B3_GPU_BROADPHASE_INTERFACE_H

View File

@@ -5,12 +5,9 @@
#include "kernels/sapKernels.h"
//#include "kernels/gridBroadphase.cl"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
@@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sap2Kernel;
//int maxPairsPerBody = 64;
int maxBodiesPerCell = 256;//??
int maxBodiesPerCell = 256; //??
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx,q),
m_smallAabbsMappingGPU(ctx,q),
m_largeAabbsMappingGPU(ctx,q),
m_gpuPairs(ctx,q),
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
: m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx, q),
m_smallAabbsMappingGPU(ctx, q),
m_largeAabbsMappingGPU(ctx, q),
m_gpuPairs(ctx, q),
m_hashGpu(ctx,q),
m_hashGpu(ctx, q),
m_cellStartGpu(ctx,q),
m_paramsGPU(ctx,q)
m_cellStartGpu(ctx, q),
m_paramsGPU(ctx, q)
{
b3Vector3 gridSize = b3MakeVector3(3,3,3);
b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]);
b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
m_paramsCPU.m_gridSize[0] = 128;
m_paramsCPU.m_gridSize[1] = 128;
@@ -58,92 +49,79 @@ m_paramsGPU(ctx,q)
m_paramsCPU.m_invCellSize[3] = 0.f;
m_paramsGPU.push_back(m_paramsCPU);
cl_int errNum=0;
cl_int errNum = 0;
{
const char* sapSrc = sapCL;
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
b3Assert(errNum==CL_SUCCESS);
m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
b3Assert(errNum == CL_SUCCESS);
m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
b3Assert(errNum == CL_SUCCESS);
}
{
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH);
b3Assert(errNum==CL_SUCCESS);
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
b3Assert(errNum == CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
}
m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
b3GpuGridBroadphase::~b3GpuGridBroadphase()
{
clReleaseKernel( kCalcHashAABB);
clReleaseKernel( kClearCellStart);
clReleaseKernel( kFindCellStart);
clReleaseKernel( kFindOverlappingPairs);
clReleaseKernel( m_sap2Kernel);
clReleaseKernel( m_copyAabbsKernel);
clReleaseKernel(kCalcHashAABB);
clReleaseKernel(kClearCellStart);
clReleaseKernel(kFindCellStart);
clReleaseKernel(kFindOverlappingPairs);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_copyAabbsKernel);
delete m_sorter;
}
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
if (0)
{
calculateOverlappingPairsHost(maxPairs);
/*
/*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
@@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
*/
return;
}
int numSmallAabbs = m_smallAabbsMappingGPU.size();
b3OpenCLArray<int> pairCount(m_context,m_queue);
b3OpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody);
m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody);
{
int numLargeAabbs = m_largeAabbsMappingGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
B3_PROFILE("sap2Kernel");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ),
b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),
b3BufferInfoCL( m_gpuPairs.getBufferCL() ),
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_gpuPairs.getBufferCL()),
b3BufferInfoCL(pairCount.getBufferCL())};
b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( numLargeAabbs );
launcher.setConst( numSmallAabbs);
launcher.setConst( 0 );//axis is not used
launcher.setConst( maxPairs );
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.setConst(numSmallAabbs);
launcher.setConst(0); //axis is not used
launcher.setConst(maxPairs);
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
int numPairs = pairCount.at(0);
if (numPairs >maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs =maxPairs;
numPairs = maxPairs;
}
}
}
if (numSmallAabbs)
{
B3_PROFILE("gridKernel");
m_hashGpu.resize(numSmallAabbs);
{
B3_PROFILE("kCalcHashAABB");
b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB");
b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
@@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
}
m_sorter->execute(m_hashGpu);
int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2];
int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
m_cellStartGpu.resize(numCells);
//b3AlignedObjectArray<int > cellStartCpu;
{
B3_PROFILE("kClearCellStart");
b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart");
b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
launch.setConst(numCells);
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numCells);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
B3_PROFILE("kFindCellStart");
b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart");
b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numSmallAabbs);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
B3_PROFILE("kFindOverlappingPairs");
b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs");
b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.setBuffer(m_paramsGPU.getBufferCL());
//launch.setBuffer(0);
launch.setBuffer(pairCount.getBufferCL());
launch.setBuffer(m_gpuPairs.getBufferCL());
launch.setConst(maxPairs);
launch.launch1D(numSmallAabbs);
int numPairs = pairCount.at(0);
if (numPairs >maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs =maxPairs;
numPairs = maxPairs;
}
m_gpuPairs.resize(numPairs);
if (0)
{
b3AlignedObjectArray<b3Int4> pairsCpu;
m_gpuPairs.copyToHost(pairsCpu);
int sz = m_gpuPairs.size();
printf("m_gpuPairs.size()=%d\n",sz);
for (int i=0;i<m_gpuPairs.size();i++)
printf("m_gpuPairs.size()=%d\n", sz);
for (int i = 0; i < m_gpuPairs.size(); i++)
{
printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y);
printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
}
printf("?!?\n");
}
}
}
//calculateOverlappingPairsHost(maxPairs);
}
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
m_hostPairs.resize(0);
m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
for (int i=0;i<m_allAabbsCPU1.size();i++)
for (int i = 0; i < m_allAabbsCPU1.size(); i++)
{
for (int j=i+1;j<m_allAabbsCPU1.size();j++)
for (int j = i + 1; j < m_allAabbsCPU1.size(); j++)
{
if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
{
b3Int4 pair;
int a = m_allAabbsCPU1[j].m_minIndices[3];
int b = m_allAabbsCPU1[i].m_minIndices[3];
if (a<=b)
if (a <= b)
{
pair.x = a;
pair.y = b;//store the original index in the unsorted aabb array
} else
pair.x = a;
pair.y = b; //store the original index in the unsorted aabb array
}
else
{
pair.x = b;
pair.y = a;//store the original index in the unsorted aabb array
pair.y = a; //store the original index in the unsorted aabb array
}
if (m_hostPairs.size()<maxPairs)
if (m_hostPairs.size() < maxPairs)
{
m_hostPairs.push_back(pair);
}
@@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
}
}
m_gpuPairs.copyFromHost(m_hostPairs);
}
//call writeAabbsToGpu after done making all changes (createProxy etc)
//call writeAabbsToGpu after done making all changes (createProxy etc)
void b3GpuGridBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
}
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
{
return this->m_allAabbsGPU1.getBufferCL();
}
int b3GpuGridBroadphase::getNumOverlap()
int b3GpuGridBroadphase::getNumOverlap()
{
return m_gpuPairs.size();
}
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
{
return m_gpuPairs.getBufferCL();
}
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
{
return m_allAabbsGPU1;
}
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
@@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View File

@@ -6,83 +6,75 @@
struct b3ParamsGridBroadphaseCL
{
float m_invCellSize[4];
int m_gridSize[4];
int m_gridSize[4];
int getMaxBodiesPerCell() const
int getMaxBodiesPerCell() const
{
return m_gridSize[3];
}
void setMaxBodiesPerCell(int maxOverlap)
void setMaxBodiesPerCell(int maxOverlap)
{
m_gridSize[3] = maxOverlap;
}
};
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
{
protected:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3AlignedObjectArray<b3Int4> m_hostPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
class b3RadixSort32CL* m_sorter;
class b3RadixSort32CL* m_sorter;
public:
b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuGridBroadphase();
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuGridBroadphase(ctx,device,q);
return new b3GpuGridBroadphase(ctx, device, q);
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_GRID_BROADPHASE_H
#endif //B3_GPU_GRID_BROADPHASE_H

View File

@@ -16,177 +16,174 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvh.h"
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
{
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_separateAabbsKernel);
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findAllNodesMergedAabbKernel);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findDistanceFromRootKernel);
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findLeafIndexRangesKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhRayTraverseKernel);
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbAabbTestKernel);
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
clReleaseKernel(m_findLeafIndexRangesKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
//Since all AABBs(both large and small) are input as a contiguous array,
//Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
//Write large AABBs into m_largeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(largeAabbIndices.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.launch1D(numLargeAabbs);
}
//Write small AABBs into m_leafNodeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(smallAabbIndices.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numSmallAabbs);
launcher.launch1D(numSmallAabbs);
}
clFinish(m_queue);
}
//
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
if(numLeaves < 2)
if (numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
if(numLeaves == 1)
if (numLeaves == 1)
{
b3SortData leaf;
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
return;
}
//
{
m_internalNodeAabbs.resize(numInternalNodes);
@@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
//Find the merged AABB of all small AABBs; this is used to define the size of
//Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
clFinish(m_queue);
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
@@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
// m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_mergedAabb.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//
constructBinaryRadixTree();
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
@@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
{
B3_PROFILE("m_findLeafIndexRangesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
@@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou
{
int maxPairs = out_overlappingPairs.size();
b3OpenCLArray<int>& numPairsGpu = m_temp;
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 1 )
if (m_leafNodeAabbs.size() > 1)
{
B3_PROFILE("PLBVH small-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH large-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
if(numPairs > maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 0 )
if (m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH ray test small AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if(numLargeAabbRigids > 0)
if (numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
if(numRayRigidPairs > maxRayRigidPairs)
if (numRayRigidPairs > maxRayRigidPairs)
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
}
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
int numLeaves = m_leafNodeAabbs.size();
int numInternalNodes = numLeaves - 1;
//Each internal node is placed in between 2 leaf nodes.
//By using this arrangement and computing the common prefix between
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
{
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//For each leaf node, select its parent node by
//For each leaf node, select its parent node by
//comparing the 2 nearest internal nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//For each internal node, perform 2 binary searches among the other internal nodes
//to its left and right to find its potential parent nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Find the number of nodes seperating each internal node and the root node
//so that the AABBs can be set using the next kernel.
//Also determine the maximum number of nodes separating an internal node and the root node.
{
B3_PROFILE("m_findDistanceFromRootKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
//the tree towards the root to set the AABBs of each internal node; each internal node
//checks its children and merges their AABBs
{
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
int maxDistanceFromRoot = -1;
{
B3_PROFILE("copy maxDistanceFromRoot to CPU");
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
clFinish(m_queue);
}
for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxDistanceFromRoot);
launcher.setConst(distanceFromRoot);
launcher.setConst(numInternalNodes);
//It may seem inefficent to launch a thread for each internal node when a
//much smaller number of nodes is actually processed, but this is actually
//faster than determining the exact nodes that are ready to merge their child AABBs.
//faster than determining the exact nodes that are ready to merge their child AABBs.
launcher.launch1D(numInternalNodes);
}
clFinish(m_queue);
}
}

View File

@@ -37,10 +37,10 @@ subject to the following restrictions:
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
@@ -49,75 +49,75 @@ subject to the following restrictions:
class b3GpuParallelLinearBvh
{
cl_command_queue m_queue;
cl_program m_parallelLinearBvhProgram;
cl_kernel m_separateAabbsKernel;
cl_kernel m_findAllNodesMergedAabbKernel;
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
//Binary radix tree construction kernels
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
cl_kernel m_findDistanceFromRootKernel;
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
cl_kernel m_findLeafIndexRangesKernel;
//Traversal kernels
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
cl_kernel m_plbvhRayTraverseKernel;
cl_kernel m_plbvhLargeAabbAabbTestKernel;
cl_kernel m_plbvhLargeAabbRayTestKernel;
b3RadixSort32CL m_radixSorter;
//1 element
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
//1 element per internal node; for binary radix tree construction
b3OpenCLArray<b3Int64> m_commonPrefixes;
b3OpenCLArray<int> m_commonPrefixLengths;
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
//1 element per leaf node (leaf nodes only include small AABBs)
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
//1 element per large AABB, which is not stored in the BVH
b3OpenCLArray<b3SapAabb> m_largeAabbs;
public:
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvh();
///Must be called before any other function
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
private:
void constructBinaryRadixTree();
};

View File

@@ -13,45 +13,44 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvhBroadphase.h"
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
{
}
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_smallAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_largeAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
@@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
//Reconstruct BVH
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
//
m_overlappingPairsGpu.resize(maxPairs);
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
b3Assert(0); //CPU version not implemented
b3Assert(0); //CPU version not implemented
}
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}

View File

@@ -21,42 +21,42 @@ subject to the following restrictions:
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
b3GpuParallelLinearBvh m_plbvh;
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
b3OpenCLArray<int> m_smallAabbsMappingGpu;
b3OpenCLArray<int> m_largeAabbsMappingGpu;
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
public:
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvhBroadphase() {}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);

File diff suppressed because it is too large Load Diff

View File

@@ -2,7 +2,7 @@
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
@@ -11,141 +11,133 @@ class b3Vector3;
#include "b3GpuBroadphaseInterface.h"
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel ;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
class b3RadixSort32CL* m_sorter;
///test for 3d SAP
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
b3OpenCLArray<int> m_addedCountGPU;
b3OpenCLArray<int> m_removedCountGPU;
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
b3OpenCLArray<int> m_addedCountGPU;
b3OpenCLArray<int> m_removedCountGPU;
int m_currentBuffer;
int m_currentBuffer;
public:
b3OpenCLArray<int> m_pairCount;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
{
return m_allAabbsGPU;
}
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
{
return m_allAabbsCPU;
}
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3OpenCLArray<b3Int4> m_overlappingPairs;
b3OpenCLArray<b3Int4> m_overlappingPairs;
//temporary gpu work memory
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
enum b3GpuSapKernelType
{
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
B3_GPU_SAP_KERNEL_ORIGINAL,
B3_GPU_SAP_KERNEL_BARRIER,
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
};
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
virtual ~b3GpuSapBroadphase();
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
}
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
}
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
}
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
}
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
}
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
void reset();
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
void reset();
void init3dSap();
virtual void calculateOverlappingPairsHostIncremental3Sap();
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_SAP_BROADPHASE_H
#endif //B3_GPU_SAP_BROADPHASE_H

View File

@@ -5,10 +5,9 @@
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
///just make sure that the b3Aabb is 16-byte aligned
B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb
{
B3_ATTRIBUTE_ALIGNED16(struct)
b3SapAabb : public b3Aabb{
};
};
#endif //B3_SAP_AABB_H
#endif //B3_SAP_AABB_H

View File

@@ -1,199 +1,198 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* gridBroadphaseCL= \
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n"
;
static const char* gridBroadphaseCL =
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n";

View File

@@ -1,342 +1,341 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* sapCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#define NEW_PAIR_MARKER -1\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
" int j = get_global_id(1);\n"
" if (j>=numUnSortedAabbs2)\n"
" return;\n"
" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
" {\n"
" int4 myPair;\n"
" \n"
" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
" if (xIndex>yIndex)\n"
" {\n"
" int tmp = xIndex;\n"
" xIndex=yIndex;\n"
" yIndex=tmp;\n"
" }\n"
" \n"
" myPair.x = xIndex;\n"
" myPair.y = yIndex;\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" \n"
" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
"}\n"
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numAabbs)\n"
" return;\n"
" \n"
" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
" \n"
" float4 s;\n"
" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
" sum[i]=s;\n"
" sum2[i]=s*s; \n"
"}\n"
;
static const char* sapCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#define NEW_PAIR_MARKER -1\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
" int j = get_global_id(1);\n"
" if (j>=numUnSortedAabbs2)\n"
" return;\n"
" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
" {\n"
" int4 myPair;\n"
" \n"
" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
" if (xIndex>yIndex)\n"
" {\n"
" int tmp = xIndex;\n"
" xIndex=yIndex;\n"
" yIndex=tmp;\n"
" }\n"
" \n"
" myPair.x = xIndex;\n"
" myPair.y = yIndex;\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" \n"
" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
"}\n"
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numAabbs)\n"
" return;\n"
" \n"
" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
" \n"
" float4 s;\n"
" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
" sum[i]=s;\n"
" sum2[i]=s*s; \n"
"}\n";