Code-style consistency improvement:

Apply clang-format-all.sh, using the _clang-format file, to all the .cpp/.h files.
Make sure not to apply it to certain serialization structures, since some parsers expect the * to be part of the name rather than the type.
This commit contains no other changes aside from adding and applying clang-format-all.sh.
This commit is contained in:
erwincoumans
2018-09-23 14:17:31 -07:00
parent b73b05e9fb
commit ab8f16961e
1773 changed files with 1081087 additions and 474249 deletions

View File

@@ -12,33 +12,31 @@
class b3GpuBroadphaseInterface
{
public:
typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q);
typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuBroadphaseInterface()
{
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
virtual void calculateOverlappingPairs(int maxPairs)=0;
virtual void calculateOverlappingPairsHost(int maxPairs)=0;
virtual void calculateOverlappingPairs(int maxPairs) = 0;
virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu()=0;
virtual void writeAabbsToGpu() = 0;
virtual cl_mem getAabbBufferWS()=0;
virtual int getNumOverlap()=0;
virtual cl_mem getOverlappingPairBuffer()=0;
virtual cl_mem getAabbBufferWS() = 0;
virtual int getNumOverlap() = 0;
virtual cl_mem getOverlappingPairBuffer() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
};
#endif //B3_GPU_BROADPHASE_INTERFACE_H
#endif //B3_GPU_BROADPHASE_INTERFACE_H

View File

@@ -5,12 +5,9 @@
#include "kernels/sapKernels.h"
//#include "kernels/gridBroadphase.cl"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
@@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sap2Kernel;
//int maxPairsPerBody = 64;
int maxBodiesPerCell = 256;//??
int maxBodiesPerCell = 256; //??
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx,q),
m_smallAabbsMappingGPU(ctx,q),
m_largeAabbsMappingGPU(ctx,q),
m_gpuPairs(ctx,q),
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
: m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx, q),
m_smallAabbsMappingGPU(ctx, q),
m_largeAabbsMappingGPU(ctx, q),
m_gpuPairs(ctx, q),
m_hashGpu(ctx,q),
m_hashGpu(ctx, q),
m_cellStartGpu(ctx,q),
m_paramsGPU(ctx,q)
m_cellStartGpu(ctx, q),
m_paramsGPU(ctx, q)
{
b3Vector3 gridSize = b3MakeVector3(3,3,3);
b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]);
b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
m_paramsCPU.m_gridSize[0] = 128;
m_paramsCPU.m_gridSize[1] = 128;
@@ -58,92 +49,79 @@ m_paramsGPU(ctx,q)
m_paramsCPU.m_invCellSize[3] = 0.f;
m_paramsGPU.push_back(m_paramsCPU);
cl_int errNum=0;
cl_int errNum = 0;
{
const char* sapSrc = sapCL;
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
b3Assert(errNum==CL_SUCCESS);
m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
b3Assert(errNum == CL_SUCCESS);
m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
b3Assert(errNum == CL_SUCCESS);
}
{
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH);
b3Assert(errNum==CL_SUCCESS);
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
b3Assert(errNum == CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
}
m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
b3GpuGridBroadphase::~b3GpuGridBroadphase()
{
clReleaseKernel( kCalcHashAABB);
clReleaseKernel( kClearCellStart);
clReleaseKernel( kFindCellStart);
clReleaseKernel( kFindOverlappingPairs);
clReleaseKernel( m_sap2Kernel);
clReleaseKernel( m_copyAabbsKernel);
clReleaseKernel(kCalcHashAABB);
clReleaseKernel(kClearCellStart);
clReleaseKernel(kFindCellStart);
clReleaseKernel(kFindOverlappingPairs);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_copyAabbsKernel);
delete m_sorter;
}
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
if (0)
{
calculateOverlappingPairsHost(maxPairs);
/*
/*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
@@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
*/
return;
}
int numSmallAabbs = m_smallAabbsMappingGPU.size();
b3OpenCLArray<int> pairCount(m_context,m_queue);
b3OpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody);
m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody);
{
int numLargeAabbs = m_largeAabbsMappingGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
B3_PROFILE("sap2Kernel");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ),
b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),
b3BufferInfoCL( m_gpuPairs.getBufferCL() ),
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_gpuPairs.getBufferCL()),
b3BufferInfoCL(pairCount.getBufferCL())};
b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( numLargeAabbs );
launcher.setConst( numSmallAabbs);
launcher.setConst( 0 );//axis is not used
launcher.setConst( maxPairs );
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.setConst(numSmallAabbs);
launcher.setConst(0); //axis is not used
launcher.setConst(maxPairs);
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
int numPairs = pairCount.at(0);
if (numPairs >maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs =maxPairs;
numPairs = maxPairs;
}
}
}
if (numSmallAabbs)
{
B3_PROFILE("gridKernel");
m_hashGpu.resize(numSmallAabbs);
{
B3_PROFILE("kCalcHashAABB");
b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB");
b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
@@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
}
m_sorter->execute(m_hashGpu);
int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2];
int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
m_cellStartGpu.resize(numCells);
//b3AlignedObjectArray<int > cellStartCpu;
{
B3_PROFILE("kClearCellStart");
b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart");
b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
launch.setConst(numCells);
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numCells);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
B3_PROFILE("kFindCellStart");
b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart");
b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numSmallAabbs);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
B3_PROFILE("kFindOverlappingPairs");
b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs");
b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.setBuffer(m_paramsGPU.getBufferCL());
//launch.setBuffer(0);
launch.setBuffer(pairCount.getBufferCL());
launch.setBuffer(m_gpuPairs.getBufferCL());
launch.setConst(maxPairs);
launch.launch1D(numSmallAabbs);
int numPairs = pairCount.at(0);
if (numPairs >maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs =maxPairs;
numPairs = maxPairs;
}
m_gpuPairs.resize(numPairs);
if (0)
{
b3AlignedObjectArray<b3Int4> pairsCpu;
m_gpuPairs.copyToHost(pairsCpu);
int sz = m_gpuPairs.size();
printf("m_gpuPairs.size()=%d\n",sz);
for (int i=0;i<m_gpuPairs.size();i++)
printf("m_gpuPairs.size()=%d\n", sz);
for (int i = 0; i < m_gpuPairs.size(); i++)
{
printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y);
printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
}
printf("?!?\n");
}
}
}
//calculateOverlappingPairsHost(maxPairs);
}
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
m_hostPairs.resize(0);
m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
for (int i=0;i<m_allAabbsCPU1.size();i++)
for (int i = 0; i < m_allAabbsCPU1.size(); i++)
{
for (int j=i+1;j<m_allAabbsCPU1.size();j++)
for (int j = i + 1; j < m_allAabbsCPU1.size(); j++)
{
if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
{
b3Int4 pair;
int a = m_allAabbsCPU1[j].m_minIndices[3];
int b = m_allAabbsCPU1[i].m_minIndices[3];
if (a<=b)
if (a <= b)
{
pair.x = a;
pair.y = b;//store the original index in the unsorted aabb array
} else
pair.x = a;
pair.y = b; //store the original index in the unsorted aabb array
}
else
{
pair.x = b;
pair.y = a;//store the original index in the unsorted aabb array
pair.y = a; //store the original index in the unsorted aabb array
}
if (m_hostPairs.size()<maxPairs)
if (m_hostPairs.size() < maxPairs)
{
m_hostPairs.push_back(pair);
}
@@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
}
}
m_gpuPairs.copyFromHost(m_hostPairs);
}
//call writeAabbsToGpu after done making all changes (createProxy etc)
//call writeAabbsToGpu after done making all changes (createProxy etc)
void b3GpuGridBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
}
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
{
return this->m_allAabbsGPU1.getBufferCL();
}
int b3GpuGridBroadphase::getNumOverlap()
int b3GpuGridBroadphase::getNumOverlap()
{
return m_gpuPairs.size();
}
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
{
return m_gpuPairs.getBufferCL();
}
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
{
return m_allAabbsGPU1;
}
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
@@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View File

@@ -6,83 +6,75 @@
struct b3ParamsGridBroadphaseCL
{
float m_invCellSize[4];
int m_gridSize[4];
int m_gridSize[4];
int getMaxBodiesPerCell() const
int getMaxBodiesPerCell() const
{
return m_gridSize[3];
}
void setMaxBodiesPerCell(int maxOverlap)
void setMaxBodiesPerCell(int maxOverlap)
{
m_gridSize[3] = maxOverlap;
}
};
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
{
protected:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3AlignedObjectArray<b3Int4> m_hostPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
class b3RadixSort32CL* m_sorter;
class b3RadixSort32CL* m_sorter;
public:
b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuGridBroadphase();
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuGridBroadphase(ctx,device,q);
return new b3GpuGridBroadphase(ctx, device, q);
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_GRID_BROADPHASE_H
#endif //B3_GPU_GRID_BROADPHASE_H

View File

@@ -16,177 +16,174 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvh.h"
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
{
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_separateAabbsKernel);
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findAllNodesMergedAabbKernel);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findDistanceFromRootKernel);
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findLeafIndexRangesKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhRayTraverseKernel);
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbAabbTestKernel);
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
clReleaseKernel(m_findLeafIndexRangesKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
//Since all AABBs(both large and small) are input as a contiguous array,
//Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
//Write large AABBs into m_largeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(largeAabbIndices.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.launch1D(numLargeAabbs);
}
//Write small AABBs into m_leafNodeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(smallAabbIndices.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numSmallAabbs);
launcher.launch1D(numSmallAabbs);
}
clFinish(m_queue);
}
//
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
if(numLeaves < 2)
if (numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
if(numLeaves == 1)
if (numLeaves == 1)
{
b3SortData leaf;
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
return;
}
//
{
m_internalNodeAabbs.resize(numInternalNodes);
@@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
//Find the merged AABB of all small AABBs; this is used to define the size of
//Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
clFinish(m_queue);
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
@@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
// m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_mergedAabb.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//
constructBinaryRadixTree();
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
@@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
{
B3_PROFILE("m_findLeafIndexRangesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
@@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou
{
int maxPairs = out_overlappingPairs.size();
b3OpenCLArray<int>& numPairsGpu = m_temp;
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 1 )
if (m_leafNodeAabbs.size() > 1)
{
B3_PROFILE("PLBVH small-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH large-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
if(numPairs > maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 0 )
if (m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH ray test small AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if(numLargeAabbRigids > 0)
if (numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
if(numRayRigidPairs > maxRayRigidPairs)
if (numRayRigidPairs > maxRayRigidPairs)
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
}
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
int numLeaves = m_leafNodeAabbs.size();
int numInternalNodes = numLeaves - 1;
//Each internal node is placed in between 2 leaf nodes.
//By using this arrangement and computing the common prefix between
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
{
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//For each leaf node, select its parent node by
//For each leaf node, select its parent node by
//comparing the 2 nearest internal nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//For each internal node, perform 2 binary searches among the other internal nodes
//to its left and right to find its potential parent nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Find the number of nodes seperating each internal node and the root node
//so that the AABBs can be set using the next kernel.
//Also determine the maximum number of nodes separating an internal node and the root node.
{
B3_PROFILE("m_findDistanceFromRootKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
//the tree towards the root to set the AABBs of each internal node; each internal node
//checks its children and merges their AABBs
{
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
int maxDistanceFromRoot = -1;
{
B3_PROFILE("copy maxDistanceFromRoot to CPU");
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
clFinish(m_queue);
}
for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxDistanceFromRoot);
launcher.setConst(distanceFromRoot);
launcher.setConst(numInternalNodes);
//It may seem inefficent to launch a thread for each internal node when a
//much smaller number of nodes is actually processed, but this is actually
//faster than determining the exact nodes that are ready to merge their child AABBs.
//faster than determining the exact nodes that are ready to merge their child AABBs.
launcher.launch1D(numInternalNodes);
}
clFinish(m_queue);
}
}

View File

@@ -37,10 +37,10 @@ subject to the following restrictions:
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
@@ -49,75 +49,75 @@ subject to the following restrictions:
class b3GpuParallelLinearBvh
{
cl_command_queue m_queue;
cl_program m_parallelLinearBvhProgram;
cl_kernel m_separateAabbsKernel;
cl_kernel m_findAllNodesMergedAabbKernel;
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
//Binary radix tree construction kernels
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
cl_kernel m_findDistanceFromRootKernel;
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
cl_kernel m_findLeafIndexRangesKernel;
//Traversal kernels
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
cl_kernel m_plbvhRayTraverseKernel;
cl_kernel m_plbvhLargeAabbAabbTestKernel;
cl_kernel m_plbvhLargeAabbRayTestKernel;
b3RadixSort32CL m_radixSorter;
//1 element
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
//1 element per internal node; for binary radix tree construction
b3OpenCLArray<b3Int64> m_commonPrefixes;
b3OpenCLArray<int> m_commonPrefixLengths;
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
//1 element per leaf node (leaf nodes only include small AABBs)
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
//1 element per large AABB, which is not stored in the BVH
b3OpenCLArray<b3SapAabb> m_largeAabbs;
public:
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvh();
///Must be called before any other function
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
private:
void constructBinaryRadixTree();
};

View File

@@ -13,45 +13,44 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvhBroadphase.h"
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
{
}
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_smallAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_largeAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
@@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
//Reconstruct BVH
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
//
m_overlappingPairsGpu.resize(maxPairs);
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
b3Assert(0); //CPU version not implemented
b3Assert(0); //CPU version not implemented
}
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}

View File

@@ -21,42 +21,42 @@ subject to the following restrictions:
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
b3GpuParallelLinearBvh m_plbvh;
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
b3OpenCLArray<int> m_smallAabbsMappingGpu;
b3OpenCLArray<int> m_largeAabbsMappingGpu;
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
public:
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvhBroadphase() {}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);

File diff suppressed because it is too large Load Diff

View File

@@ -2,7 +2,7 @@
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
@@ -11,141 +11,133 @@ class b3Vector3;
#include "b3GpuBroadphaseInterface.h"
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel ;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
class b3RadixSort32CL* m_sorter;
///test for 3d SAP
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
b3OpenCLArray<int> m_addedCountGPU;
b3OpenCLArray<int> m_removedCountGPU;
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
b3OpenCLArray<int> m_addedCountGPU;
b3OpenCLArray<int> m_removedCountGPU;
int m_currentBuffer;
int m_currentBuffer;
public:
b3OpenCLArray<int> m_pairCount;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
{
return m_allAabbsGPU;
}
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
{
return m_allAabbsCPU;
}
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3OpenCLArray<b3Int4> m_overlappingPairs;
b3OpenCLArray<b3Int4> m_overlappingPairs;
//temporary gpu work memory
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
enum b3GpuSapKernelType
{
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
B3_GPU_SAP_KERNEL_ORIGINAL,
B3_GPU_SAP_KERNEL_BARRIER,
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
};
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
virtual ~b3GpuSapBroadphase();
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
}
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
}
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
}
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
}
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
}
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
void reset();
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
void reset();
void init3dSap();
virtual void calculateOverlappingPairsHostIncremental3Sap();
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_SAP_BROADPHASE_H
#endif //B3_GPU_SAP_BROADPHASE_H

View File

@@ -5,10 +5,9 @@
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
///just make sure that the b3Aabb is 16-byte aligned
B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb
{
B3_ATTRIBUTE_ALIGNED16(struct)
b3SapAabb : public b3Aabb{
};
};
#endif //B3_SAP_AABB_H
#endif //B3_SAP_AABB_H

View File

@@ -1,199 +1,198 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* gridBroadphaseCL= \
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n"
;
static const char* gridBroadphaseCL =
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n";

View File

@@ -1,342 +1,341 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* sapCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#define NEW_PAIR_MARKER -1\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
" int j = get_global_id(1);\n"
" if (j>=numUnSortedAabbs2)\n"
" return;\n"
" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
" {\n"
" int4 myPair;\n"
" \n"
" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
" if (xIndex>yIndex)\n"
" {\n"
" int tmp = xIndex;\n"
" xIndex=yIndex;\n"
" yIndex=tmp;\n"
" }\n"
" \n"
" myPair.x = xIndex;\n"
" myPair.y = yIndex;\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" \n"
" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
"}\n"
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numAabbs)\n"
" return;\n"
" \n"
" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
" \n"
" float4 s;\n"
" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
" sum[i]=s;\n"
" sum2[i]=s*s; \n"
"}\n"
;
// Auto-generated embedded OpenCL source for the sweep-and-prune (SAP) broadphase
// kernels (pair generation, AABB copy/sort helpers, and variance reduction).
// NOTE: this string is produced by the kernel-stringify step from the original
// .cl file — do not hand-edit the literal text; regenerate it instead. Some
// tooling depends on the exact bytes (including trailing spaces inside lines).
static const char* sapCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#define NEW_PAIR_MARKER -1\n"
	"typedef struct \n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float4	m_min;\n"
	"		float   m_minElems[4];\n"
	"		int			m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float4	m_max;\n"
	"		float   m_maxElems[4];\n"
	"		int			m_maxIndices[4];\n"
	"	};\n"
	"} btAabbCL;\n"
	"/// conservative test for overlap between two aabbs\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numUnsortedAabbs)\n"
	"		return;\n"
	"	int j = get_global_id(1);\n"
	"	if (j>=numUnSortedAabbs2)\n"
	"		return;\n"
	"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
	"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
	"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
	"	{\n"
	"		int4 myPair;\n"
	"		\n"
	"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
	"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
	"		if (xIndex>yIndex)\n"
	"		{\n"
	"			int tmp = xIndex;\n"
	"			xIndex=yIndex;\n"
	"			yIndex=tmp;\n"
	"		}\n"
	"		\n"
	"		myPair.x = xIndex;\n"
	"		myPair.y = yIndex;\n"
	"		myPair.z = NEW_PAIR_MARKER;\n"
	"		myPair.w = NEW_PAIR_MARKER;\n"
	"		int curPair = atomic_inc (pairCount);\n"
	"		if (curPair<maxPairs)\n"
	"		{\n"
	"			pairsOut[curPair] = myPair; //flush to main memory\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"		{\n"
	"			break;\n"
	"		}\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"			if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = aabbs[i].m_minIndices[3];\n"
	"				myPair.y = aabbs[j].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		j++;\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"}\n"
	"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
	"	\n"
	"	btAabbCL myAabb;\n"
	"	\n"
	"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
	"	float testValue = 	myAabb.m_maxElems[axis];\n"
	"	\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	int localCount=0;\n"
	"	int block=0;\n"
	"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
	"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
	"	\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = myAabb.m_minIndices[3];\n"
	"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		localCount++;\n"
	"		if (localCount==64)\n"
	"		{\n"
	"			localCount = 0;\n"
	"			block+=64;			\n"
	"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
	"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
	"		}\n"
	"		j++;\n"
	"		\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"	\n"
	"}\n"
	"//http://stereopsis.com/radix.html\n"
	"unsigned int FloatFlip(float fl);\n"
	"unsigned int FloatFlip(float fl)\n"
	"{\n"
	"	unsigned int f = *(unsigned int*)&fl;\n"
	"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
	"	return f ^ mask;\n"
	"}\n"
	"float IFloatFlip(unsigned int f);\n"
	"float IFloatFlip(unsigned int f)\n"
	"{\n"
	"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
	"	unsigned int fl = f ^ mask;\n"
	"	return *(float*)&fl;\n"
	"}\n"
	"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	int src = destAabbs[i].m_maxIndices[3];\n"
	"	destAabbs[i] = allAabbs[src];\n"
	"	destAabbs[i].m_maxIndices[3] = src;\n"
	"}\n"
	"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	\n"
	"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
	"	sortData[i].y = i;\n"
	"		\n"
	"}\n"
	"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
	"}\n"
	"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numAabbs)\n"
	"		return;\n"
	"	\n"
	"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
	"	\n"
	"	float4 s;\n"
	"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
	"	sum[i]=s;\n"
	"	sum2[i]=s*s;	\n"
	"}\n";

View File

@@ -17,7 +17,7 @@ subject to the following restrictions:
#define B3_OPENCL_INCLUDE_H
#ifdef B3_USE_CLEW
#include "clew/clew.h"
#include "clew/clew.h"
#else
#ifdef __APPLE__
@@ -25,7 +25,7 @@ subject to the following restrictions:
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#endif
#else
#ifdef USE_MINICL
@@ -34,15 +34,18 @@ subject to the following restrictions:
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif //_WIN32
#endif
#endif //__APPLE__
#endif //B3_USE_CLEW
#endif //__APPLE__
#endif //B3_USE_CLEW
#include <assert.h>
#include <stdio.h>
#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
#endif //B3_OPENCL_INCLUDE_H
#define oclCHECKERROR(a, b) \
if ((a) != (b)) \
{ \
printf("OCL Error : %d\n", (a)); \
assert((a) == (b)); \
}
#endif //B3_OPENCL_INCLUDE_H

File diff suppressed because it is too large Load Diff

View File

@@ -22,42 +22,41 @@ subject to the following restrictions:
#include "b3OpenCLInclude.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
///C API for OpenCL utilities: convenience functions, see below for C++ API
///C API for OpenCL utilities: convenience functions, see below for C++ API
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* platformId);
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
cl_kernel b3OpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros);
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros);
//optional
cl_program b3OpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching, bool disableBinaryCaching);
//optional
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros, const char* srcFileNameForCaching, bool disableBinaryCaching);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
const char* b3OpenCLUtils_getSdkVendorName();
const char* b3OpenCLUtils_getSdkVendorName();
///set the path (directory/folder) where the compiled OpenCL kernel are stored
void b3OpenCLUtils_setCachePath(const char* path);
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex);
///set the path (directory/folder) where the compiled OpenCL kernel are stored
void b3OpenCLUtils_setCachePath(const char* path);
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex);
#ifdef __cplusplus
}
@@ -71,37 +70,35 @@ typedef struct
char m_driverVersion[B3_MAX_STRING_LENGTH];
char m_deviceExtensions[B3_MAX_STRING_LENGTH];
cl_device_type m_deviceType;
cl_uint m_computeUnits;
size_t m_workitemDims;
size_t m_workItemSize[3];
size_t m_image2dMaxWidth;
size_t m_image2dMaxHeight;
size_t m_image3dMaxWidth;
size_t m_image3dMaxHeight;
size_t m_image3dMaxDepth;
size_t m_workgroupSize;
cl_uint m_clockFrequency;
cl_ulong m_constantBufferSize;
cl_ulong m_localMemSize;
cl_ulong m_globalMemSize;
cl_bool m_errorCorrectionSupport;
cl_device_type m_deviceType;
cl_uint m_computeUnits;
size_t m_workitemDims;
size_t m_workItemSize[3];
size_t m_image2dMaxWidth;
size_t m_image2dMaxHeight;
size_t m_image3dMaxWidth;
size_t m_image3dMaxHeight;
size_t m_image3dMaxDepth;
size_t m_workgroupSize;
cl_uint m_clockFrequency;
cl_ulong m_constantBufferSize;
cl_ulong m_localMemSize;
cl_ulong m_globalMemSize;
cl_bool m_errorCorrectionSupport;
cl_device_local_mem_type m_localMemType;
cl_uint m_maxReadImageArgs;
cl_uint m_maxWriteImageArgs;
cl_uint m_maxReadImageArgs;
cl_uint m_maxWriteImageArgs;
cl_uint m_addressBits;
cl_ulong m_maxMemAllocSize;
cl_uint m_addressBits;
cl_ulong m_maxMemAllocSize;
cl_command_queue_properties m_queueProperties;
cl_bool m_imageSupport;
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
cl_bool m_imageSupport;
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
} b3OpenCLDeviceInfo;
@@ -110,33 +107,32 @@ struct b3OpenCLPlatformInfo
char m_platformVendor[B3_MAX_STRING_LENGTH];
char m_platformName[B3_MAX_STRING_LENGTH];
char m_platformVersion[B3_MAX_STRING_LENGTH];
b3OpenCLPlatformInfo()
{
m_platformVendor[0]=0;
m_platformName[0]=0;
m_platformVersion[0]=0;
m_platformVendor[0] = 0;
m_platformName[0] = 0;
m_platformVersion[0] = 0;
}
};
///C++ API for OpenCL utilities: convenience functions
struct b3OpenCLUtils
{
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0)
static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1, cl_platform_id* platformId = 0)
{
return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId);
return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex, platformId);
}
static inline int getNumDevices(cl_context cxMainContext)
{
return b3OpenCLUtils_getNumDevices(cxMainContext);
}
static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
{
return b3OpenCLUtils_getDevice(cxMainContext,nr);
return b3OpenCLUtils_getDevice(cxMainContext, nr);
}
static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info);
@@ -146,28 +142,28 @@ struct b3OpenCLUtils
b3OpenCLUtils_printDeviceInfo(device);
}
static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" )
static inline cl_kernel compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum = 0, cl_program prog = 0, const char* additionalMacros = "")
{
return b3OpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros);
return b3OpenCLUtils_compileCLKernelFromString(clContext, device, kernelSource, kernelName, pErrNum, prog, additionalMacros);
}
//optional
static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0, bool disableBinaryCaching=false)
static inline cl_program compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum = 0, const char* additionalMacros = "", const char* srcFileNameForCaching = 0, bool disableBinaryCaching = false)
{
return b3OpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching, disableBinaryCaching);
return b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, srcFileNameForCaching, disableBinaryCaching);
}
//the following optional APIs provide access using specific platform information
static inline int getNumPlatforms(cl_int* pErrNum=0)
static inline int getNumPlatforms(cl_int* pErrNum = 0)
{
return b3OpenCLUtils_getNumPlatforms(pErrNum);
}
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0)
static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum = 0)
{
return b3OpenCLUtils_getPlatform(nr,pErrNum);
return b3OpenCLUtils_getPlatform(nr, pErrNum);
}
static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo);
static inline void printPlatformInfo(cl_platform_id platform)
@@ -179,9 +175,9 @@ struct b3OpenCLUtils
{
return b3OpenCLUtils_getSdkVendorName();
}
static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1)
static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1)
{
return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex);
return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex);
}
static void setCachePath(const char* path)
{
@@ -189,6 +185,6 @@ struct b3OpenCLUtils
}
};
#endif //__cplusplus
#endif //__cplusplus
#endif // B3_OPENCL_UTILS_H
#endif // B3_OPENCL_UTILS_H

View File

@@ -5,14 +5,13 @@
struct b3BvhInfo
{
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_quantization;
int m_numNodes;
int m_numSubTrees;
int m_nodeOffset;
int m_subTreeOffset;
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_quantization;
int m_numNodes;
int m_numSubTrees;
int m_nodeOffset;
int m_subTreeOffset;
};
#endif //B3_BVH_INFO_H
#endif //B3_BVH_INFO_H

View File

@@ -15,7 +15,6 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3ContactCache.h"
#include "Bullet3Common/b3Transform.h"
@@ -69,7 +68,7 @@ int b3ContactCache::sortCachedPoints(const b3Vector3& pt)
maxPenetration = m_pointCache[i].getDistance();
}
}
#endif //KEEP_DEEPEST_POINT
#endif //KEEP_DEEPEST_POINT
b3Scalar res0(b3Scalar(0.)),res1(b3Scalar(0.)),res2(b3Scalar(0.)),res3(b3Scalar(0.));
@@ -251,8 +250,4 @@ void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transfo
}
#endif

View File

@@ -17,17 +17,13 @@ subject to the following restrictions:
#ifndef B3_CONTACT_CACHE_H
#define B3_CONTACT_CACHE_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Common/b3AlignedAllocator.h"
///maximum contact breaking and merging threshold
extern b3Scalar gContactBreakingThreshold;
#define MANIFOLD_CACHE_SIZE 4
///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase.
@@ -37,24 +33,16 @@ extern b3Scalar gContactBreakingThreshold;
///reduces the cache to 4 points, when more then 4 points are added, using following rules:
///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points
///note that some pairs of objects might have more then one contact manifold.
B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache
B3_ATTRIBUTE_ALIGNED16(class)
b3ContactCache
{
/// sort cached points so most isolated points come first
int sortCachedPoints(const b3Vector3& pt);
int sortCachedPoints(const b3Vector3& pt);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
int addManifoldPoint( const b3Vector3& newPoint);
int addManifoldPoint(const b3Vector3& newPoint);
/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
{
@@ -63,18 +51,12 @@ public:
}
*/
static bool validContactDistance(const b3Vector3& pt);
/// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin
static void refreshContactPoints( const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache);
static void removeContactPoint(struct b3Contact4Data& newContactCache,int i);
static void refreshContactPoints(const b3Transform& trA, const b3Transform& trB, struct b3Contact4Data& newContactCache);
static void removeContactPoint(struct b3Contact4Data & newContactCache, int i);
};
#endif //B3_CONTACT_CACHE_H
#endif //B3_CONTACT_CACHE_H

File diff suppressed because it is too large Load Diff

View File

@@ -17,102 +17,90 @@
//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
struct GpuSatCollision
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_mprPenetrationKernel;
cl_kernel m_findSeparatingAxisUnitSphereKernel;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_mprPenetrationKernel;
cl_kernel m_findSeparatingAxisUnitSphereKernel;
cl_kernel m_findSeparatingAxisVertexFaceKernel;
cl_kernel m_findSeparatingAxisEdgeEdgeKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel;
cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel;
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndFindContacts;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
// cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel;
cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel;
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndFindContacts;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
// cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
b3OpenCLArray<b3Vector3> m_unitSphereDirections;
b3OpenCLArray<int> m_totalContactsOut;
b3OpenCLArray<int> m_totalContactsOut;
b3OpenCLArray<b3Vector3> m_sepNormals;
b3OpenCLArray<float> m_dmins;
b3OpenCLArray<int> m_hasSeparatingNormals;
b3OpenCLArray<int> m_hasSeparatingNormals;
b3OpenCLArray<b3Vector3> m_concaveSepNormals;
b3OpenCLArray<int> m_concaveHasSeparatingNormals;
b3OpenCLArray<int> m_numConcavePairsOut;
b3OpenCLArray<int> m_concaveHasSeparatingNormals;
b3OpenCLArray<int> m_numConcavePairsOut;
b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs;
b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals;
b3OpenCLArray<int> m_gpuHasCompoundSepNormals;
b3OpenCLArray<int> m_numCompoundPairsOut;
b3OpenCLArray<int> m_gpuHasCompoundSepNormals;
b3OpenCLArray<int> m_numCompoundPairsOut;
GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q );
GpuSatCollision(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~GpuSatCollision();
void computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
const b3OpenCLArray<b3Contact4>* oldContacts,
int maxContactCapacity,
int compoundPairCapacity,
const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut
);
void computeConvexConvexContactsGPUSAT(b3OpenCLArray<b3Int4>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
const b3OpenCLArray<b3Contact4>* oldContacts,
int maxContactCapacity,
int compoundPairCapacity,
const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut);
};
#endif //_CONVEX_HULL_CONTACT_H
#endif //_CONVEX_HULL_CONTACT_H

View File

@@ -4,6 +4,4 @@
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#endif //CONVEX_POLYHEDRON_CL
#endif //CONVEX_POLYHEDRON_CL

File diff suppressed because it is too large Load Diff

View File

@@ -29,40 +29,39 @@ GJK-EPA collision solver by Nathanael Presson, 2008
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
///btGjkEpaSolver contributed under zlib by Nathanael Presson
struct b3GjkEpaSolver2
struct b3GjkEpaSolver2
{
struct sResults
struct sResults
{
enum eStatus
enum eStatus
{
Separated, /* Shapes doesnt penetrate */
Penetrating, /* Shapes are penetrating */
GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */
EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */
} status;
b3Vector3 witnesses[2];
b3Vector3 normal;
b3Scalar distance;
Separated, /* Shapes doesnt penetrate */
Penetrating, /* Shapes are penetrating */
GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */
EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */
} status;
b3Vector3 witnesses[2];
b3Vector3 normal;
b3Scalar distance;
};
static int StackSizeRequirement();
static int StackSizeRequirement();
static bool Distance( const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results);
static bool Distance(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results);
static bool Penetration( const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
static bool Penetration(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results,
bool usemargins=true);
bool usemargins = true);
#if 0
static b3Scalar SignedDistance( const b3Vector3& position,
b3Scalar margin,
@@ -74,9 +73,7 @@ static bool SignedDistance( const btConvexShape* shape0,const btTransform& wtrs
const btConvexShape* shape1,const btTransform& wtrs1,
const b3Vector3& guess,
sResults& results);
#endif
#endif
};
#endif //B3_GJK_EPA2_H
#endif //B3_GJK_EPA2_H

View File

@@ -13,50 +13,45 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3OptimizedBvh.h"
#include "b3StridingMeshInterface.h"
#include "Bullet3Geometry/b3AabbUtil.h"
b3OptimizedBvh::b3OptimizedBvh()
{
{
}
b3OptimizedBvh::~b3OptimizedBvh()
{
}
void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)
{
m_useQuantization = useQuantizedAabbCompression;
// NodeArray triangleNodes;
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
{
NodeArray& m_triangleNodes;
NodeArray& m_triangleNodes;
NodeTriangleCallback& operator=(NodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
return *this;
}
NodeTriangleCallback(NodeArray& triangleNodes)
:m_triangleNodes(triangleNodes)
NodeTriangleCallback(NodeArray& triangleNodes)
: m_triangleNodes(triangleNodes)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
b3OptimizedBvhNode node;
b3Vector3 aabbMin,aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMin, aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
@@ -69,17 +64,17 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
node.m_aabbMaxOrg = aabbMax;
node.m_escapeIndex = -1;
//for child nodes
node.m_subPart = partId;
node.m_triangleIndex = triangleIndex;
m_triangleNodes.push_back(node);
}
};
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
{
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)
{
@@ -88,23 +83,23 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
return *this;
}
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes,const b3QuantizedBvh* tree)
:m_triangleNodes(triangleNodes),m_optimizedTree(tree)
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes, const b3QuantizedBvh* tree)
: m_triangleNodes(triangleNodes), m_optimizedTree(tree)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
// The partId and triangle index must fit in the same (positive) integer
b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS)));
b3Assert(partId < (1 << MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1 << (31 - MAX_NUM_PARTS_IN_BITS)));
//negative indices are reserved for escapeIndex
b3Assert(triangleIndex>=0);
b3Assert(triangleIndex >= 0);
b3QuantizedBvhNode node;
b3Vector3 aabbMin,aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMin, aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
@@ -131,59 +126,52 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
}
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0], aabbMin, 0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0], aabbMax, 1);
node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
node.m_escapeIndexOrTriangleIndex = (partId << (31 - MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
m_triangleNodes.push_back(node);
}
};
int numLeafNodes = 0;
if (m_useQuantization)
{
//initialize quantization values
setQuantizationValues(bvhAabbMin,bvhAabbMax);
setQuantizationValues(bvhAabbMin, bvhAabbMax);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes,this);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes, this);
triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax);
triangles->InternalProcessAllTriangles(&callback, m_bvhAabbMin, m_bvhAabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_quantizedLeafNodes.size();
m_quantizedContiguousNodes.resize(2*numLeafNodes);
} else
m_quantizedContiguousNodes.resize(2 * numLeafNodes);
}
else
{
NodeTriangleCallback callback(m_leafNodes);
NodeTriangleCallback callback(m_leafNodes);
b3Vector3 aabbMin=b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax=b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
b3Vector3 aabbMin = b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax);
triangles->InternalProcessAllTriangles(&callback, aabbMin, aabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_leafNodes.size();
m_contiguousNodes.resize(2*numLeafNodes);
m_contiguousNodes.resize(2 * numLeafNodes);
}
m_curNodeIndex = 0;
buildTree(0,numLeafNodes);
buildTree(0, numLeafNodes);
///if the entire tree is small then subtree size, we need to create a header info for the tree
if(m_useQuantization && !m_SubtreeHeaders.size())
if (m_useQuantization && !m_SubtreeHeaders.size())
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
@@ -199,37 +187,29 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
m_leafNodes.clear();
}
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
if (m_useQuantization)
{
setQuantizationValues(aabbMin, aabbMax);
setQuantizationValues(aabbMin,aabbMax);
updateBvhNodes(meshInterface,0,m_curNodeIndex,0);
updateBvhNodes(meshInterface, 0, m_curNodeIndex, 0);
///now update all subtree headers
int i;
for (i=0;i<m_SubtreeHeaders.size();i++)
for (i = 0; i < m_SubtreeHeaders.size(); i++)
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
}
} else
}
else
{
}
}
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
//incrementally initialize quantization values
b3Assert(m_useQuantization);
@@ -244,147 +224,135 @@ void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b
///we should update all quantization values, using updateBvhNodes(meshInterface);
///but we only update chunks that overlap the given aabb
unsigned short quantizedQueryAabbMin[3];
unsigned short quantizedQueryAabbMax[3];
quantize(&quantizedQueryAabbMin[0],aabbMin,0);
quantize(&quantizedQueryAabbMax[0],aabbMax,1);
unsigned short quantizedQueryAabbMin[3];
unsigned short quantizedQueryAabbMax[3];
quantize(&quantizedQueryAabbMin[0], aabbMin, 0);
quantize(&quantizedQueryAabbMax[0], aabbMax, 1);
int i;
for (i=0;i<this->m_SubtreeHeaders.size();i++)
for (i = 0; i < this->m_SubtreeHeaders.size(); i++)
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
//PCK: unsigned instead of bool
unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax);
if (overlap != 0)
{
updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i);
updateBvhNodes(meshInterface, subtree.m_rootNodeIndex, subtree.m_rootNodeIndex + subtree.m_subtreeSize, i);
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
}
}
}
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index)
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int firstNode, int endNode, int index)
{
(void)index;
b3Assert(m_useQuantization);
int curNodeSubPart=-1;
int curNodeSubPart = -1;
//get access info to trianglemesh data
const unsigned char *vertexbase = 0;
int numverts = 0;
PHY_ScalarType type = PHY_INTEGER;
int stride = 0;
const unsigned char *indexbase = 0;
int indexstride = 0;
int numfaces = 0;
PHY_ScalarType indicestype = PHY_INTEGER;
const unsigned char* vertexbase = 0;
int numverts = 0;
PHY_ScalarType type = PHY_INTEGER;
int stride = 0;
const unsigned char* indexbase = 0;
int indexstride = 0;
int numfaces = 0;
PHY_ScalarType indicestype = PHY_INTEGER;
b3Vector3 triangleVerts[3];
b3Vector3 aabbMin,aabbMax;
const b3Vector3& meshScaling = meshInterface->getScaling();
int i;
for (i=endNode-1;i>=firstNode;i--)
b3Vector3 triangleVerts[3];
b3Vector3 aabbMin, aabbMax;
const b3Vector3& meshScaling = meshInterface->getScaling();
int i;
for (i = endNode - 1; i >= firstNode; i--)
{
b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
if (curNode.isLeafNode())
{
b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
if (curNode.isLeafNode())
//recalc aabb from triangle data
int nodeSubPart = curNode.getPartId();
int nodeTriangleIndex = curNode.getTriangleIndex();
if (nodeSubPart != curNodeSubPart)
{
//recalc aabb from triangle data
int nodeSubPart = curNode.getPartId();
int nodeTriangleIndex = curNode.getTriangleIndex();
if (nodeSubPart != curNodeSubPart)
{
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts, type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart);
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart);
curNodeSubPart = nodeSubPart;
b3Assert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT);
}
//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
curNodeSubPart = nodeSubPart;
b3Assert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
}
//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride);
for (int j=2;j>=0;j--)
{
int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j];
if (type == PHY_FLOAT)
{
float* graphicsbase = (float*)(vertexbase+graphicsindex*stride);
triangleVerts[j] = b3MakeVector3(
graphicsbase[0]*meshScaling.getX(),
graphicsbase[1]*meshScaling.getY(),
graphicsbase[2]*meshScaling.getZ());
}
else
{
double* graphicsbase = (double*)(vertexbase+graphicsindex*stride);
triangleVerts[j] = b3MakeVector3( b3Scalar(graphicsbase[0]*meshScaling.getX()), b3Scalar(graphicsbase[1]*meshScaling.getY()), b3Scalar(graphicsbase[2]*meshScaling.getZ()));
}
}
unsigned int* gfxbase = (unsigned int*)(indexbase + nodeTriangleIndex * indexstride);
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangleVerts[0]);
aabbMax.setMax(triangleVerts[0]);
aabbMin.setMin(triangleVerts[1]);
aabbMax.setMax(triangleVerts[1]);
aabbMin.setMin(triangleVerts[2]);
aabbMax.setMax(triangleVerts[2]);
quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0);
quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1);
} else
for (int j = 2; j >= 0; j--)
{
//combine aabb from both children
b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1];
b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] :
&m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()];
int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
if (type == PHY_FLOAT)
{
for (int i=0;i<3;i++)
{
curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
if (curNode.m_quantizedAabbMin[i]>rightChildNode->m_quantizedAabbMin[i])
curNode.m_quantizedAabbMin[i]=rightChildNode->m_quantizedAabbMin[i];
curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
}
float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(
graphicsbase[0] * meshScaling.getX(),
graphicsbase[1] * meshScaling.getY(),
graphicsbase[2] * meshScaling.getZ());
}
else
{
double* graphicsbase = (double*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(b3Scalar(graphicsbase[0] * meshScaling.getX()), b3Scalar(graphicsbase[1] * meshScaling.getY()), b3Scalar(graphicsbase[2] * meshScaling.getZ()));
}
}
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangleVerts[0]);
aabbMax.setMax(triangleVerts[0]);
aabbMin.setMin(triangleVerts[1]);
aabbMax.setMax(triangleVerts[1]);
aabbMin.setMin(triangleVerts[2]);
aabbMax.setMax(triangleVerts[2]);
quantize(&curNode.m_quantizedAabbMin[0], aabbMin, 0);
quantize(&curNode.m_quantizedAabbMax[0], aabbMax, 1);
}
else
{
//combine aabb from both children
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i + 1];
b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i + 2] : &m_quantizedContiguousNodes[i + 1 + leftChildNode->getEscapeIndex()];
{
for (int i = 0; i < 3; i++)
{
curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
if (curNode.m_quantizedAabbMin[i] > rightChildNode->m_quantizedAabbMin[i])
curNode.m_quantizedAabbMin[i] = rightChildNode->m_quantizedAabbMin[i];
curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
}
}
}
}
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
{
b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
//we don't add additional data so just do a static upcast
return static_cast<b3OptimizedBvh*>(bvh);
}

View File

@@ -22,44 +22,35 @@ subject to the following restrictions:
class b3StridingMeshInterface;
///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface.
B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh
B3_ATTRIBUTE_ALIGNED16(class)
b3OptimizedBvh : public b3QuantizedBvh
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
protected:
public:
b3OptimizedBvh();
virtual ~b3OptimizedBvh();
void build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
void build(b3StridingMeshInterface * triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
void refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax);
void refit(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
void refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax);
void refitPartial(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
void updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index);
void updateBvhNodes(b3StridingMeshInterface * meshInterface, int firstNode, int endNode, int index);
/// Data buffer MUST be 16 byte aligned
virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
virtual bool serializeInPlace(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
{
return b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
return b3QuantizedBvh::serialize(o_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static b3OptimizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
};
#endif //B3_OPTIMIZED_BVH_H
#endif //B3_OPTIMIZED_BVH_H

File diff suppressed because it is too large Load Diff

View File

@@ -22,11 +22,11 @@ class b3Serializer;
#ifdef DEBUG_CHECK_DEQUANTIZATION
#ifdef __SPU__
#define printf spu_printf
#endif //__SPU__
#endif //__SPU__
#include <stdio.h>
#include <stdlib.h>
#endif //DEBUG_CHECK_DEQUANTIZATION
#endif //DEBUG_CHECK_DEQUANTIZATION
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
@@ -44,13 +44,10 @@ class b3Serializer;
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
//Note: currently we have 16 bytes per quantized node
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one
// actually) triangles each (since the sign bit is reserved
@@ -58,7 +55,8 @@ class b3Serializer;
///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeData
B3_ATTRIBUTE_ALIGNED16(struct)
b3QuantizedBvhNode : public b3QuantizedBvhNodeData
{
B3_DECLARE_ALIGNED_ALLOCATOR();
@@ -72,48 +70,48 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeDa
b3Assert(!isLeafNode());
return -m_escapeIndexOrTriangleIndex;
}
int getTriangleIndex() const
int getTriangleIndex() const
{
b3Assert(isLeafNode());
unsigned int x=0;
unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
unsigned int x = 0;
unsigned int y = (~(x & 0)) << (31 - MAX_NUM_PARTS_IN_BITS);
// Get only the lower bits where the triangle index is stored
return (m_escapeIndexOrTriangleIndex&~(y));
return (m_escapeIndexOrTriangleIndex & ~(y));
}
int getPartId() const
int getPartId() const
{
b3Assert(isLeafNode());
// Get only the highest bits where the part index is stored
return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
return (m_escapeIndexOrTriangleIndex >> (31 - MAX_NUM_PARTS_IN_BITS));
}
}
;
};
/// b3OptimizedBvhNode contains both internal and leaf node information.
/// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes.
B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode
B3_ATTRIBUTE_ALIGNED16(struct)
b3OptimizedBvhNode
{
B3_DECLARE_ALIGNED_ALLOCATOR();
//32 bytes
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
//4
int m_escapeIndex;
int m_escapeIndex;
//8
//for child nodes
int m_subPart;
int m_triangleIndex;
int m_subPart;
int m_triangleIndex;
//pad the size to 64 bytes
char m_padding[20];
//pad the size to 64 bytes
char m_padding[20];
};
///b3BvhSubtreeInfo provides info to gather a subtree of limited size
B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
B3_ATTRIBUTE_ALIGNED16(class)
b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
@@ -123,8 +121,7 @@ public:
//memset(&m_padding[0], 0, sizeof(m_padding));
}
void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
{
m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0];
m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1];
@@ -133,14 +130,12 @@ public:
m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1];
m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2];
}
}
;
};
class b3NodeOverlapCallback
{
public:
virtual ~b3NodeOverlapCallback() {};
virtual ~b3NodeOverlapCallback(){};
virtual void processNode(int subPart, int triangleIndex) = 0;
};
@@ -148,18 +143,16 @@ public:
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
///for code readability:
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
///It is used by the b3BvhTriangleMeshShape as midphase
///It is recommended to use quantization for better performance and lower memory requirements.
B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh
B3_ATTRIBUTE_ALIGNED16(class)
b3QuantizedBvh
{
public:
enum b3TraversalMode
@@ -169,56 +162,48 @@ public:
TRAVERSAL_RECURSIVE
};
b3Vector3 m_bvhAabbMin;
b3Vector3 m_bvhAabbMax;
b3Vector3 m_bvhQuantization;
b3Vector3 m_bvhAabbMin;
b3Vector3 m_bvhAabbMax;
b3Vector3 m_bvhQuantization;
protected:
int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess.
int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess.
int m_curNodeIndex;
int m_curNodeIndex;
//quantization data
bool m_useQuantization;
bool m_useQuantization;
NodeArray m_leafNodes;
NodeArray m_contiguousNodes;
QuantizedNodeArray m_quantizedLeafNodes;
QuantizedNodeArray m_quantizedContiguousNodes;
NodeArray m_leafNodes;
NodeArray m_contiguousNodes;
QuantizedNodeArray m_quantizedLeafNodes;
QuantizedNodeArray m_quantizedContiguousNodes;
b3TraversalMode m_traversalMode;
BvhSubtreeInfoArray m_SubtreeHeaders;
b3TraversalMode m_traversalMode;
BvhSubtreeInfoArray m_SubtreeHeaders;
//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
mutable int m_subtreeHeaderCount;
///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
///this might be refactored into a virtual, it is usually not calculated at run-time
void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
{
if (m_useQuantization)
{
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0);
} else
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0], aabbMin, 0);
}
else
{
m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
}
}
void setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax)
void setInternalNodeAabbMax(int nodeIndex, const b3Vector3& aabbMax)
{
if (m_useQuantization)
{
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1);
} else
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0], aabbMax, 1);
}
else
{
m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
}
@@ -232,115 +217,102 @@ protected:
}
//non-quantized
return m_leafNodes[nodeIndex].m_aabbMinOrg;
}
b3Vector3 getAabbMax(int nodeIndex) const
{
if (m_useQuantization)
{
return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
}
}
//non-quantized
return m_leafNodes[nodeIndex].m_aabbMaxOrg;
}
void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
{
if (m_useQuantization)
{
m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;
}
}
else
{
m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
}
}
void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax)
void mergeInternalNodeAabb(int nodeIndex, const b3Vector3& newAabbMin, const b3Vector3& newAabbMax)
{
if (m_useQuantization)
{
unsigned short int quantizedAabbMin[3];
unsigned short int quantizedAabbMax[3];
quantize(quantizedAabbMin,newAabbMin,0);
quantize(quantizedAabbMax,newAabbMax,1);
for (int i=0;i<3;i++)
quantize(quantizedAabbMin, newAabbMin, 0);
quantize(quantizedAabbMax, newAabbMax, 1);
for (int i = 0; i < 3; i++)
{
if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];
if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
}
} else
}
else
{
//non-quantized
m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
}
}
void swapLeafNodes(int firstIndex,int secondIndex);
void swapLeafNodes(int firstIndex, int secondIndex);
void assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex);
void assignInternalNodeFromLeafNode(int internalNode, int leafNodeIndex);
protected:
void buildTree(int startIndex, int endIndex);
int calcSplittingAxis(int startIndex, int endIndex);
void buildTree (int startIndex,int endIndex);
int sortAndCalcSplittingIndex(int startIndex, int endIndex, int splitAxis);
int calcSplittingAxis(int startIndex,int endIndex);
void walkStacklessTree(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
int sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis);
void walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
void walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const;
void walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;
void walkStacklessQuantizedTree(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const;
void walkStacklessTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;
///tree traversal designed for small-memory processors like PS3 SPU
void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;
///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode, b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;
///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const;
void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA, const b3QuantizedBvhNode* treeNodeB, b3NodeOverlapCallback* nodeCallback) const;
void updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
void updateSubtreeHeaders(int leftChildNodexIndex, int rightChildNodexIndex);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3QuantizedBvh();
virtual ~b3QuantizedBvh();
///***************************************** expert/internal use only *************************
void setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0));
QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
void setQuantizationValues(const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax, b3Scalar quantizationMargin = b3Scalar(1.0));
QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
void buildInternal();
void buildInternal();
///***************************************** expert/internal use only *************************
void reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
void reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void reportAabbOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
void reportRayOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
void reportBoxCastOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const
B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point, int isMax) const
{
b3Assert(m_useQuantization);
b3Assert(point.getX() <= m_bvhAabbMax.getX());
@@ -357,16 +329,16 @@ public:
///@todo: double-check this
if (isMax)
{
out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1));
out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1));
out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1));
} else
{
out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe));
out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe));
out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe));
out[0] = (unsigned short)(((unsigned short)(v.getX() + b3Scalar(1.)) | 1));
out[1] = (unsigned short)(((unsigned short)(v.getY() + b3Scalar(1.)) | 1));
out[2] = (unsigned short)(((unsigned short)(v.getZ() + b3Scalar(1.)) | 1));
}
else
{
out[0] = (unsigned short)(((unsigned short)(v.getX()) & 0xfffe));
out[1] = (unsigned short)(((unsigned short)(v.getY()) & 0xfffe));
out[2] = (unsigned short)(((unsigned short)(v.getZ()) & 0xfffe));
}
#ifdef DEBUG_CHECK_DEQUANTIZATION
b3Vector3 newPoint = unQuantize(out);
@@ -374,105 +346,97 @@ public:
{
if (newPoint.getX() < point.getX())
{
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
}
if (newPoint.getY() < point.getY())
{
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
}
if (newPoint.getZ() < point.getZ())
{
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
}
} else
}
else
{
if (newPoint.getX() > point.getX())
{
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
}
if (newPoint.getY() > point.getY())
{
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
}
if (newPoint.getZ() > point.getZ())
{
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
}
}
#endif //DEBUG_CHECK_DEQUANTIZATION
#endif //DEBUG_CHECK_DEQUANTIZATION
}
B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const
B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2, int isMax) const
{
b3Assert(m_useQuantization);
b3Vector3 clampedPoint(point2);
clampedPoint.setMax(m_bvhAabbMin);
clampedPoint.setMin(m_bvhAabbMax);
quantize(out,clampedPoint,isMax);
quantize(out, clampedPoint, isMax);
}
B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
{
b3Vector3 vecOut;
vecOut.setValue(
b3Vector3 vecOut;
vecOut.setValue(
(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()),
(b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()),
(b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
vecOut += m_bvhAabbMin;
return vecOut;
vecOut += m_bvhAabbMin;
return vecOut;
}
///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
void setTraversalMode(b3TraversalMode traversalMode)
void setTraversalMode(b3TraversalMode traversalMode)
{
m_traversalMode = traversalMode;
}
B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
{
return m_quantizedContiguousNodes;
B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
{
return m_quantizedContiguousNodes;
}
B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
{
return m_SubtreeHeaders;
}
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
/////Calculate space needed to store BVH for serialization
unsigned calculateSerializeBufferSize() const;
/// Data buffer MUST be 16 byte aligned
virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
virtual bool serialize(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static b3QuantizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static unsigned int getAlignmentSerializationPadding();
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
virtual int calculateSerializeBufferSizeNew() const;
virtual int calculateSerializeBufferSizeNew() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
virtual void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData);
virtual void deSerializeFloat(struct b3QuantizedBvhFloatData & quantizedBvhFloatData);
virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData);
virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData & quantizedBvhDoubleData);
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
B3_FORCE_INLINE bool isQuantized()
{
@@ -483,74 +447,65 @@ private:
// Special "copy" constructor that allows for in-place deserialization
// Prevents b3Vector3's default constructor from being called, but doesn't inialize much else
// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory);
}
;
b3QuantizedBvh(b3QuantizedBvh & other, bool ownsMemory);
};
struct b3OptimizedBvhNodeFloatData
{
b3Vector3FloatData m_aabbMinOrg;
b3Vector3FloatData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
b3Vector3FloatData m_aabbMinOrg;
b3Vector3FloatData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
};
struct b3OptimizedBvhNodeDoubleData
{
b3Vector3DoubleData m_aabbMinOrg;
b3Vector3DoubleData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
b3Vector3DoubleData m_aabbMinOrg;
b3Vector3DoubleData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
};
struct b3QuantizedBvhFloatData
struct b3QuantizedBvhFloatData
{
b3Vector3FloatData m_bvhAabbMin;
b3Vector3FloatData m_bvhAabbMax;
b3Vector3FloatData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeFloatData *m_contiguousNodesPtr;
b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
b3BvhSubtreeInfoData *m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3Vector3FloatData m_bvhAabbMin;
b3Vector3FloatData m_bvhAabbMax;
b3Vector3FloatData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeFloatData* m_contiguousNodesPtr;
b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
b3BvhSubtreeInfoData* m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
};
struct b3QuantizedBvhDoubleData
struct b3QuantizedBvhDoubleData
{
b3Vector3DoubleData m_bvhAabbMin;
b3Vector3DoubleData m_bvhAabbMax;
b3Vector3DoubleData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeDoubleData *m_contiguousNodesPtr;
b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
b3Vector3DoubleData m_bvhAabbMin;
b3Vector3DoubleData m_bvhAabbMax;
b3Vector3DoubleData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeDoubleData* m_contiguousNodesPtr;
b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3BvhSubtreeInfoData *m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3BvhSubtreeInfoData* m_subTreeInfoPtr;
};
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
{
return sizeof(b3QuantizedBvhData);
}
#endif //B3_QUANTIZED_BVH_H
#endif //B3_QUANTIZED_BVH_H

View File

@@ -15,35 +15,32 @@ subject to the following restrictions:
#include "b3StridingMeshInterface.h"
b3StridingMeshInterface::~b3StridingMeshInterface()
{
}
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
int numtotalphysicsverts = 0;
int part,graphicssubparts = getNumSubParts();
const unsigned char * vertexbase;
const unsigned char * indexbase;
int part, graphicssubparts = getNumSubParts();
const unsigned char* vertexbase;
const unsigned char* indexbase;
int indexstride;
PHY_ScalarType type;
PHY_ScalarType gfxindextype;
int stride,numverts,numtriangles;
int stride, numverts, numtriangles;
int gfxindex;
b3Vector3 triangle[3];
b3Vector3 meshScaling = getScaling();
///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
for (part=0;part<graphicssubparts ;part++)
for (part = 0; part < graphicssubparts; part++)
{
getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part);
numtotalphysicsverts+=numtriangles*3; //upper bound
getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numtriangles, gfxindextype, part);
numtotalphysicsverts += numtriangles * 3; //upper bound
///unlike that developers want to pass in double-precision meshes in single-precision Bullet build
///so disable this feature by default
@@ -51,143 +48,141 @@ void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleInde
switch (type)
{
case PHY_FLOAT:
{
case PHY_FLOAT:
{
float* graphicsbase;
float* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
case PHY_DOUBLE:
case PHY_DOUBLE:
{
double* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
case PHY_INTEGER:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
case PHY_SHORT:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
case PHY_UCHAR:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
}
unLockReadOnlyVertexBase(part);
}
}
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax)
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin, b3Vector3& aabbMax)
{
struct AabbCalculationCallback : public b3InternalTriangleIndexCallback
struct AabbCalculationCallback : public b3InternalTriangleIndexCallback
{
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
AabbCalculationCallback()
{
m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
(void)partId;
(void)triangleIndex;
@@ -202,13 +197,11 @@ void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vecto
};
//first calculate the total aabb for all triangles
AabbCalculationCallback aabbCallback;
aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
InternalProcessAllTriangles(&aabbCallback,aabbMin,aabbMax);
AabbCalculationCallback aabbCallback;
aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
InternalProcessAllTriangles(&aabbCallback, aabbMin, aabbMax);
aabbMin = aabbCallback.m_aabbMin;
aabbMax = aabbCallback.m_aabbMax;
}

View File

@@ -20,148 +20,139 @@ subject to the following restrictions:
#include "b3TriangleCallback.h"
//#include "b3ConcaveShape.h"
enum PHY_ScalarType {
PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT,
PHY_FIXEDPOINT88, PHY_UCHAR
enum PHY_ScalarType
{
PHY_FLOAT,
PHY_DOUBLE,
PHY_INTEGER,
PHY_SHORT,
PHY_FIXEDPOINT88,
PHY_UCHAR
};
/// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface
B3_ATTRIBUTE_ALIGNED16(class)
b3StridingMeshInterface
{
protected:
b3Vector3 m_scaling;
protected:
b3Vector3 m_scaling;
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3StridingMeshInterface() :m_scaling(b3MakeVector3(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.)))
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
}
b3StridingMeshInterface() : m_scaling(b3MakeVector3(b3Scalar(1.), b3Scalar(1.), b3Scalar(1.)))
{
}
virtual ~b3StridingMeshInterface();
virtual ~b3StridingMeshInterface();
virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback * callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
///brute force method to calculate aabb
void calculateAabbBruteForce(b3Vector3 & aabbMin, b3Vector3 & aabbMax);
virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
/// get read and write access to a subpart of a triangle mesh
/// this subpart has a continuous array of vertices and indices
/// in this way the mesh can be handled as chunks of memory with striding
/// very similar to OpenGL vertexarray support
/// make a call to unLockVertexBase when the read and write access is finished
virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) = 0;
///brute force method to calculate aabb
void calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax);
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const = 0;
/// get read and write access to a subpart of a triangle mesh
/// this subpart has a continuous array of vertices and indices
/// in this way the mesh can be handled as chunks of memory with striding
/// very similar to OpenGL vertexarray support
/// make a call to unLockVertexBase when the read and write access is finished
virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0;
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart)=0;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart) = 0;
virtual void unLockReadOnlyVertexBase(int subpart) const=0;
virtual void unLockReadOnlyVertexBase(int subpart) const = 0;
/// getNumSubParts returns the number of seperate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const = 0;
/// getNumSubParts returns the number of seperate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const=0;
virtual void preallocateVertices(int numverts) = 0;
virtual void preallocateIndices(int numindices) = 0;
virtual void preallocateVertices(int numverts)=0;
virtual void preallocateIndices(int numindices)=0;
virtual bool hasPremadeAabb() const { return false; }
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
}
virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
}
virtual bool hasPremadeAabb() const { return false; }
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
{
(void) aabbMin;
(void) aabbMax;
}
virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
{
(void) aabbMin;
(void) aabbMax;
}
const b3Vector3& getScaling() const {
return m_scaling;
}
void setScaling(const b3Vector3& scaling)
{
m_scaling = scaling;
}
virtual int calculateSerializeBufferSize() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
const b3Vector3& getScaling() const
{
return m_scaling;
}
void setScaling(const b3Vector3& scaling)
{
m_scaling = scaling;
}
virtual int calculateSerializeBufferSize() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
};
struct b3IntIndexData
struct b3IntIndexData
{
int m_value;
int m_value;
};
struct b3ShortIntIndexData
struct b3ShortIntIndexData
{
short m_value;
char m_pad[2];
};
struct b3ShortIntIndexTripletData
struct b3ShortIntIndexTripletData
{
short m_values[3];
char m_pad[2];
short m_values[3];
char m_pad[2];
};
struct b3CharIndexTripletData
struct b3CharIndexTripletData
{
unsigned char m_values[3];
char m_pad;
char m_pad;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
struct b3MeshPartData
struct b3MeshPartData
{
b3Vector3FloatData *m_vertices3f;
b3Vector3DoubleData *m_vertices3d;
b3Vector3FloatData* m_vertices3f;
b3Vector3DoubleData* m_vertices3d;
b3IntIndexData *m_indices32;
b3ShortIntIndexTripletData *m_3indices16;
b3CharIndexTripletData *m_3indices8;
b3IntIndexData* m_indices32;
b3ShortIntIndexTripletData* m_3indices16;
b3CharIndexTripletData* m_3indices8;
b3ShortIntIndexData *m_indices16;//backwards compatibility
b3ShortIntIndexData* m_indices16; //backwards compatibility
int m_numTriangles;//length of m_indices = m_numTriangles
int m_numVertices;
int m_numTriangles; //length of m_indices = m_numTriangles
int m_numVertices;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
struct b3StridingMeshInterfaceData
struct b3StridingMeshInterfaceData
{
b3MeshPartData *m_meshPartsPtr;
b3Vector3FloatData m_scaling;
int m_numMeshParts;
b3MeshPartData* m_meshPartsPtr;
b3Vector3FloatData m_scaling;
int m_numMeshParts;
char m_padding[4];
};
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
{
return sizeof(b3StridingMeshInterfaceData);
}
#endif //B3_STRIDING_MESHINTERFACE_H
#endif //B3_STRIDING_MESHINTERFACE_H

View File

@@ -6,33 +6,29 @@
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "b3VectorFloat4.h"
struct b3GjkPairDetector;
inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
{
b3Vector3 supVec = b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
b3Vector3 supVec = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
b3Scalar maxDot = b3Scalar(-B3_LARGE_FLOAT);
// Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically.
if( 0 < hull->m_numVertices )
{
const b3Vector3 scaled = supportVec;
int index = (int) scaled.maxDot( &verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot);
return verticesA[hull->m_vertexOffset+index];
}
return supVec;
// Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically.
if (0 < hull->m_numVertices)
{
const b3Vector3 scaled = supportVec;
int index = (int)scaled.maxDot(&verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot);
return verticesA[hull->m_vertexOffset + index];
}
return supVec;
}
inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA)
inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA)
{
return localGetSupportVertexWithMargin(supportVec,hull,verticesA,0.f);
return localGetSupportVertexWithMargin(supportVec, hull, verticesA, 0.f);
}
#endif //B3_SUPPORT_MAPPINGS_H
#endif //B3_SUPPORT_MAPPINGS_H

View File

@@ -17,12 +17,8 @@ subject to the following restrictions:
b3TriangleCallback::~b3TriangleCallback()
{
}
b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback()
{
}

View File

@@ -18,13 +18,11 @@ subject to the following restrictions:
#include "Bullet3Common/b3Vector3.h"
///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
class b3TriangleCallback
{
public:
virtual ~b3TriangleCallback();
virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
@@ -32,11 +30,8 @@ public:
class b3InternalTriangleIndexCallback
{
public:
virtual ~b3InternalTriangleIndexCallback();
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) = 0;
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
#endif //B3_TRIANGLE_CALLBACK_H
#endif //B3_TRIANGLE_CALLBACK_H

View File

@@ -15,81 +15,76 @@ subject to the following restrictions:
#include "b3TriangleIndexVertexArray.h"
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride)
: m_hasAabb(0)
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride)
: m_hasAabb(0)
{
b3IndexedMesh mesh;
mesh.m_numTriangles = numTriangles;
mesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase;
mesh.m_triangleIndexBase = (const unsigned char*)triangleIndexBase;
mesh.m_triangleIndexStride = triangleIndexStride;
mesh.m_numVertices = numVertices;
mesh.m_vertexBase = (const unsigned char *)vertexBase;
mesh.m_vertexBase = (const unsigned char*)vertexBase;
mesh.m_vertexStride = vertexStride;
addIndexedMesh(mesh);
}
b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray()
{
}
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart)
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart)
{
b3Assert(subpart< getNumSubParts() );
b3Assert(subpart < getNumSubParts());
b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (unsigned char *) mesh.m_vertexBase;
(*vertexbase) = (unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (unsigned char *)mesh.m_triangleIndexBase;
(*indexbase) = (unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart) const
{
const b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (const unsigned char *)mesh.m_vertexBase;
(*vertexbase) = (const unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase;
(*indexbase) = (const unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
{
return (m_hasAabb == 1);
}
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
m_aabbMin = aabbMin;
m_aabbMax = aabbMax;
m_hasAabb = 1; // this is intentionally an int see notes in header
m_hasAabb = 1; // this is intentionally an int see notes in header
}
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax) const
{
*aabbMin = m_aabbMin;
*aabbMax = m_aabbMax;
}

View File

@@ -20,62 +20,59 @@ subject to the following restrictions:
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Scalar.h"
///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
///Instead of the number of indices, we pass the number of triangles.
B3_ATTRIBUTE_ALIGNED16( struct) b3IndexedMesh
B3_ATTRIBUTE_ALIGNED16(struct)
b3IndexedMesh
{
B3_DECLARE_ALIGNED_ALLOCATOR();
int m_numTriangles;
const unsigned char * m_triangleIndexBase;
// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
int m_triangleIndexStride;
int m_numVertices;
const unsigned char * m_vertexBase;
// Size of a vertex, in bytes
int m_vertexStride;
int m_numTriangles;
const unsigned char* m_triangleIndexBase;
// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
int m_triangleIndexStride;
int m_numVertices;
const unsigned char* m_vertexBase;
// Size of a vertex, in bytes
int m_vertexStride;
// The index type is set when adding an indexed mesh to the
// b3TriangleIndexVertexArray, do not set it manually
PHY_ScalarType m_indexType;
// The index type is set when adding an indexed mesh to the
// b3TriangleIndexVertexArray, do not set it manually
PHY_ScalarType m_indexType;
// The vertex type has a default type similar to Bullet's precision mode (float or double)
// but can be set manually if you for example run Bullet with double precision but have
// mesh data in single precision..
PHY_ScalarType m_vertexType;
// The vertex type has a default type similar to Bullet's precision mode (float or double)
// but can be set manually if you for example run Bullet with double precision but have
// mesh data in single precision..
PHY_ScalarType m_vertexType;
b3IndexedMesh()
:m_indexType(PHY_INTEGER),
b3IndexedMesh()
: m_indexType(PHY_INTEGER),
#ifdef B3_USE_DOUBLE_PRECISION
m_vertexType(PHY_DOUBLE)
#else // B3_USE_DOUBLE_PRECISION
m_vertexType(PHY_FLOAT)
#endif // B3_USE_DOUBLE_PRECISION
{
}
}
;
m_vertexType(PHY_DOUBLE)
#else // B3_USE_DOUBLE_PRECISION
m_vertexType(PHY_FLOAT)
#endif // B3_USE_DOUBLE_PRECISION
{
}
};
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
///Additional meshes can be added using addIndexedMesh
///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays.
///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface
B3_ATTRIBUTE_ALIGNED16(class)
b3TriangleIndexVertexArray : public b3StridingMeshInterface
{
protected:
IndexedMeshArray m_indexedMeshes;
IndexedMeshArray m_indexedMeshes;
int m_pad[2];
mutable int m_hasAabb; // using int instead of bool to maintain alignment
mutable int m_hasAabb; // using int instead of bool to maintain alignment
mutable b3Vector3 m_aabbMin;
mutable b3Vector3 m_aabbMax;
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3TriangleIndexVertexArray() : m_hasAabb(0)
@@ -85,49 +82,47 @@ public:
virtual ~b3TriangleIndexVertexArray();
//just to be backwards compatible
b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride);
void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride);
void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
{
m_indexedMeshes.push_back(mesh);
m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType;
m_indexedMeshes[m_indexedMeshes.size() - 1].m_indexType = indexType;
}
virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0);
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const;
virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0);
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart) {(void)subpart;}
virtual void unLockVertexBase(int subpart) { (void)subpart; }
virtual void unLockReadOnlyVertexBase(int subpart) const {(void)subpart;}
virtual void unLockReadOnlyVertexBase(int subpart) const { (void)subpart; }
/// getNumSubParts returns the number of seperate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const {
virtual int getNumSubParts() const
{
return (int)m_indexedMeshes.size();
}
IndexedMeshArray& getIndexedMeshArray()
IndexedMeshArray& getIndexedMeshArray()
{
return m_indexedMeshes;
}
const IndexedMeshArray& getIndexedMeshArray() const
const IndexedMeshArray& getIndexedMeshArray() const
{
return m_indexedMeshes;
}
virtual void preallocateVertices(int numverts){(void) numverts;}
virtual void preallocateIndices(int numindices){(void) numindices;}
virtual void preallocateVertices(int numverts) { (void)numverts; }
virtual void preallocateIndices(int numindices) { (void)numindices; }
virtual bool hasPremadeAabb() const;
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const;
virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const;
virtual bool hasPremadeAabb() const;
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const;
};
}
;
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H

View File

@@ -7,5 +7,4 @@
#define float4 b3Vector3
//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w)
#endif //B3_VECTOR_FLOAT4_H
#endif //B3_VECTOR_FLOAT4_H

View File

@@ -23,26 +23,24 @@ subject to the following restrictions:
*/
#include "b3VoronoiSimplexSolver.h"
#define VERTA 0
#define VERTB 1
#define VERTC 2
#define VERTD 3
#define VERTA 0
#define VERTB 1
#define VERTC 2
#define VERTD 3
#define B3_CATCH_DEGENERATE_TETRAHEDRON 1
void b3VoronoiSimplexSolver::removeVertex(int index)
void b3VoronoiSimplexSolver::removeVertex(int index)
{
b3Assert(m_numVertices>0);
b3Assert(m_numVertices > 0);
m_numVertices--;
m_simplexVectorW[index] = m_simplexVectorW[m_numVertices];
m_simplexPointsP[index] = m_simplexPointsP[m_numVertices];
m_simplexPointsQ[index] = m_simplexPointsQ[m_numVertices];
}
void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts)
void b3VoronoiSimplexSolver::reduceVertices(const b3UsageBitfield& usedVerts)
{
if ((numVertices() >= 4) && (!usedVerts.usedVertexD))
removeVertex(3);
@@ -52,29 +50,22 @@ void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts)
if ((numVertices() >= 2) && (!usedVerts.usedVertexB))
removeVertex(1);
if ((numVertices() >= 1) && (!usedVerts.usedVertexA))
removeVertex(0);
}
//clear the simplex, remove all the vertices
void b3VoronoiSimplexSolver::reset()
{
m_cachedValidClosest = false;
m_numVertices = 0;
m_needsUpdate = true;
m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
m_cachedBC.reset();
}
//add a vertex
//add a vertex
void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q)
{
m_lastW = w;
@@ -87,9 +78,8 @@ void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, c
m_numVertices++;
}
bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
{
if (m_needsUpdate)
{
m_cachedBC.reset();
@@ -98,127 +88,131 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
switch (numVertices())
{
case 0:
case 0:
m_cachedValidClosest = false;
break;
case 1:
case 1:
{
m_cachedP1 = m_simplexPointsP[0];
m_cachedP2 = m_simplexPointsQ[0];
m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0]
m_cachedV = m_cachedP1 - m_cachedP2; //== m_simplexVectorW[0]
m_cachedBC.reset();
m_cachedBC.setBarycentricCoordinates(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
m_cachedBC.setBarycentricCoordinates(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
m_cachedValidClosest = m_cachedBC.isValid();
break;
};
case 2:
case 2:
{
//closest point origin from line segment
const b3Vector3& from = m_simplexVectorW[0];
const b3Vector3& to = m_simplexVectorW[1];
b3Vector3 nearest;
//closest point origin from line segment
const b3Vector3& from = m_simplexVectorW[0];
const b3Vector3& to = m_simplexVectorW[1];
b3Vector3 nearest;
b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
b3Vector3 diff = p - from;
b3Vector3 v = to - from;
b3Scalar t = v.dot(diff);
if (t > 0) {
b3Scalar dotVV = v.dot(v);
if (t < dotVV) {
t /= dotVV;
diff -= t*v;
m_cachedBC.m_usedVertices.usedVertexA = true;
m_cachedBC.m_usedVertices.usedVertexB = true;
} else {
t = 1;
diff -= v;
//reduce to 1 point
m_cachedBC.m_usedVertices.usedVertexB = true;
}
} else
b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
b3Vector3 diff = p - from;
b3Vector3 v = to - from;
b3Scalar t = v.dot(diff);
if (t > 0)
{
b3Scalar dotVV = v.dot(v);
if (t < dotVV)
{
t = 0;
//reduce to 1 point
t /= dotVV;
diff -= t * v;
m_cachedBC.m_usedVertices.usedVertexA = true;
m_cachedBC.m_usedVertices.usedVertexB = true;
}
m_cachedBC.setBarycentricCoordinates(1-t,t);
nearest = from + t*v;
else
{
t = 1;
diff -= v;
//reduce to 1 point
m_cachedBC.m_usedVertices.usedVertexB = true;
}
}
else
{
t = 0;
//reduce to 1 point
m_cachedBC.m_usedVertices.usedVertexA = true;
}
m_cachedBC.setBarycentricCoordinates(1 - t, t);
nearest = from + t * v;
m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
m_cachedV = m_cachedP1 - m_cachedP2;
reduceVertices(m_cachedBC.m_usedVertices);
m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
m_cachedV = m_cachedP1 - m_cachedP2;
m_cachedValidClosest = m_cachedBC.isValid();
break;
reduceVertices(m_cachedBC.m_usedVertices);
m_cachedValidClosest = m_cachedBC.isValid();
break;
}
case 3:
{
//closest point origin from triangle
b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
const b3Vector3& a = m_simplexVectorW[0];
const b3Vector3& b = m_simplexVectorW[1];
const b3Vector3& c = m_simplexVectorW[2];
closestPtPointTriangle(p,a,b,c,m_cachedBC);
m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedV = m_cachedP1-m_cachedP2;
reduceVertices (m_cachedBC.m_usedVertices);
m_cachedValidClosest = m_cachedBC.isValid();
break;
}
case 4:
case 3:
{
//closest point origin from triangle
b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
const b3Vector3& a = m_simplexVectorW[0];
const b3Vector3& b = m_simplexVectorW[1];
const b3Vector3& c = m_simplexVectorW[2];
closestPtPointTriangle(p, a, b, c, m_cachedBC);
m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedV = m_cachedP1 - m_cachedP2;
reduceVertices(m_cachedBC.m_usedVertices);
m_cachedValidClosest = m_cachedBC.isValid();
break;
}
case 4:
{
b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
const b3Vector3& a = m_simplexVectorW[0];
const b3Vector3& b = m_simplexVectorW[1];
const b3Vector3& c = m_simplexVectorW[2];
const b3Vector3& d = m_simplexVectorW[3];
bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC);
bool hasSeperation = closestPtPointTetrahedron(p, a, b, c, d, m_cachedBC);
if (hasSeperation)
{
m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
m_cachedV = m_cachedP1-m_cachedP2;
reduceVertices (m_cachedBC.m_usedVertices);
} else
m_cachedV = m_cachedP1 - m_cachedP2;
reduceVertices(m_cachedBC.m_usedVertices);
}
else
{
// printf("sub distance got penetration\n");
// printf("sub distance got penetration\n");
if (m_cachedBC.m_degenerate)
{
m_cachedValidClosest = false;
} else
}
else
{
m_cachedValidClosest = true;
//degenerate case == false, penetration = true + zero
m_cachedV.setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
m_cachedV.setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
}
break;
}
@@ -228,7 +222,7 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
//closest point origin from tetrahedron
break;
}
default:
default:
{
m_cachedValidClosest = false;
}
@@ -236,7 +230,6 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
}
return m_cachedValidClosest;
}
//return/calculate the closest vertex
@@ -247,13 +240,11 @@ bool b3VoronoiSimplexSolver::closest(b3Vector3& v)
return succes;
}
b3Scalar b3VoronoiSimplexSolver::maxVertex()
{
int i, numverts = numVertices();
b3Scalar maxV = b3Scalar(0.);
for (i=0;i<numverts;i++)
for (i = 0; i < numverts; i++)
{
b3Scalar curLen2 = m_simplexVectorW[i].length2();
if (maxV < curLen2)
@@ -262,13 +253,11 @@ b3Scalar b3VoronoiSimplexSolver::maxVertex()
return maxV;
}
//return the current simplex
int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const
//return the current simplex
int b3VoronoiSimplexSolver::getSimplex(b3Vector3* pBuf, b3Vector3* qBuf, b3Vector3* yBuf) const
{
int i;
for (i=0;i<numVertices();i++)
for (i = 0; i < numVertices(); i++)
{
yBuf[i] = m_simplexVectorW[i];
pBuf[i] = m_simplexPointsP[i];
@@ -277,20 +266,17 @@ int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vecto
return numVertices();
}
bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
{
bool found = false;
int i, numverts = numVertices();
//b3Scalar maxV = b3Scalar(0.);
//w is in the current (reduced) simplex
for (i=0;i<numverts;i++)
for (i = 0; i < numverts; i++)
{
#ifdef BT_USE_EQUAL_VERTEX_THRESHOLD
if ( m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
if (m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
#else
if (m_simplexVectorW[i] == w)
#endif
@@ -300,199 +286,190 @@ bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
//check in case lastW is already removed
if (w == m_lastW)
return true;
return found;
}
void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v)
void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v)
{
v = m_cachedV;
}
bool b3VoronoiSimplexSolver::emptySimplex() const
bool b3VoronoiSimplexSolver::emptySimplex() const
{
return (numVertices() == 0);
}
void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2)
void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2)
{
updateClosestVectorAndPoints();
p1 = m_cachedP1;
p2 = m_cachedP2;
}
bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result)
bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result)
{
result.m_usedVertices.reset();
// Check if P in vertex region outside A
b3Vector3 ab = b - a;
b3Vector3 ac = c - a;
b3Vector3 ap = p - a;
b3Scalar d1 = ab.dot(ap);
b3Scalar d2 = ac.dot(ap);
if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0))
// Check if P in vertex region outside A
b3Vector3 ab = b - a;
b3Vector3 ac = c - a;
b3Vector3 ap = p - a;
b3Scalar d1 = ab.dot(ap);
b3Scalar d2 = ac.dot(ap);
if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0))
{
result.m_closestPointOnSimplex = a;
result.m_usedVertices.usedVertexA = true;
result.setBarycentricCoordinates(1,0,0);
return true;// a; // barycentric coordinates (1,0,0)
result.setBarycentricCoordinates(1, 0, 0);
return true; // a; // barycentric coordinates (1,0,0)
}
// Check if P in vertex region outside B
b3Vector3 bp = p - b;
b3Scalar d3 = ab.dot(bp);
b3Scalar d4 = ac.dot(bp);
if (d3 >= b3Scalar(0.0) && d4 <= d3)
// Check if P in vertex region outside B
b3Vector3 bp = p - b;
b3Scalar d3 = ab.dot(bp);
b3Scalar d4 = ac.dot(bp);
if (d3 >= b3Scalar(0.0) && d4 <= d3)
{
result.m_closestPointOnSimplex = b;
result.m_usedVertices.usedVertexB = true;
result.setBarycentricCoordinates(0,1,0);
result.setBarycentricCoordinates(0, 1, 0);
return true; // b; // barycentric coordinates (0,1,0)
return true; // b; // barycentric coordinates (0,1,0)
}
// Check if P in edge region of AB, if so return projection of P onto AB
b3Scalar vc = d1*d4 - d3*d2;
if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) {
b3Scalar v = d1 / (d1 - d3);
// Check if P in edge region of AB, if so return projection of P onto AB
b3Scalar vc = d1 * d4 - d3 * d2;
if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0))
{
b3Scalar v = d1 / (d1 - d3);
result.m_closestPointOnSimplex = a + v * ab;
result.m_usedVertices.usedVertexA = true;
result.m_usedVertices.usedVertexB = true;
result.setBarycentricCoordinates(1-v,v,0);
result.setBarycentricCoordinates(1 - v, v, 0);
return true;
//return a + v * ab; // barycentric coordinates (1-v,v,0)
}
//return a + v * ab; // barycentric coordinates (1-v,v,0)
}
// Check if P in vertex region outside C
b3Vector3 cp = p - c;
b3Scalar d5 = ab.dot(cp);
b3Scalar d6 = ac.dot(cp);
if (d6 >= b3Scalar(0.0) && d5 <= d6)
// Check if P in vertex region outside C
b3Vector3 cp = p - c;
b3Scalar d5 = ab.dot(cp);
b3Scalar d6 = ac.dot(cp);
if (d6 >= b3Scalar(0.0) && d5 <= d6)
{
result.m_closestPointOnSimplex = c;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(0,0,1);
return true;//c; // barycentric coordinates (0,0,1)
result.setBarycentricCoordinates(0, 0, 1);
return true; //c; // barycentric coordinates (0,0,1)
}
// Check if P in edge region of AC, if so return projection of P onto AC
b3Scalar vb = d5*d2 - d1*d6;
if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) {
b3Scalar w = d2 / (d2 - d6);
// Check if P in edge region of AC, if so return projection of P onto AC
b3Scalar vb = d5 * d2 - d1 * d6;
if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0))
{
b3Scalar w = d2 / (d2 - d6);
result.m_closestPointOnSimplex = a + w * ac;
result.m_usedVertices.usedVertexA = true;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(1-w,0,w);
result.setBarycentricCoordinates(1 - w, 0, w);
return true;
//return a + w * ac; // barycentric coordinates (1-w,0,w)
}
//return a + w * ac; // barycentric coordinates (1-w,0,w)
}
// Check if P in edge region of BC, if so return projection of P onto BC
b3Scalar va = d3 * d6 - d5 * d4;
if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0))
{
b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
// Check if P in edge region of BC, if so return projection of P onto BC
b3Scalar va = d3*d6 - d5*d4;
if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) {
b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
result.m_closestPointOnSimplex = b + w * (c - b);
result.m_usedVertices.usedVertexB = true;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(0,1-w,w);
return true;
// return b + w * (c - b); // barycentric coordinates (0,1-w,w)
}
result.setBarycentricCoordinates(0, 1 - w, w);
return true;
// return b + w * (c - b); // barycentric coordinates (0,1-w,w)
}
// P inside face region. Compute Q through its barycentric coordinates (u,v,w)
b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
b3Scalar v = vb * denom;
b3Scalar w = vc * denom;
// P inside face region. Compute Q through its barycentric coordinates (u,v,w)
b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
b3Scalar v = vb * denom;
b3Scalar w = vc * denom;
result.m_closestPointOnSimplex = a + ab * v + ac * w;
result.m_usedVertices.usedVertexA = true;
result.m_usedVertices.usedVertexB = true;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(1-v-w,v,w);
result.setBarycentricCoordinates(1 - v - w, v, w);
return true;
// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
}
/// Test if point p and d lie on opposite sides of plane through abc
int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d)
{
b3Vector3 normal = (b-a).cross(c-a);
b3Vector3 normal = (b - a).cross(c - a);
b3Scalar signp = (p - a).dot(normal); // [AP AB AC]
b3Scalar signd = (d - a).dot( normal); // [AD AB AC]
b3Scalar signp = (p - a).dot(normal); // [AP AB AC]
b3Scalar signd = (d - a).dot(normal); // [AD AB AC]
#ifdef B3_CATCH_DEGENERATE_TETRAHEDRON
#ifdef BT_USE_DOUBLE_PRECISION
if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
{
return -1;
}
#else
if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4)))
{
// printf("affine dependent/degenerate\n");//
// printf("affine dependent/degenerate\n");//
return -1;
}
#endif
#endif
// Points on opposite sides if expression signs are opposite
return signp * signd < b3Scalar(0.);
return signp * signd < b3Scalar(0.);
}
bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
{
b3SubSimplexClosestResult tempResult;
// Start out assuming point inside all halfspaces, so closest to itself
// Start out assuming point inside all halfspaces, so closest to itself
finalResult.m_closestPointOnSimplex = p;
finalResult.m_usedVertices.reset();
finalResult.m_usedVertices.usedVertexA = true;
finalResult.m_usedVertices.usedVertexA = true;
finalResult.m_usedVertices.usedVertexB = true;
finalResult.m_usedVertices.usedVertexC = true;
finalResult.m_usedVertices.usedVertexD = true;
int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
{
finalResult.m_degenerate = true;
return false;
}
if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
{
return false;
}
b3Scalar bestSqDist = FLT_MAX;
// If point outside face abc then compute closest point on abc
if (pointOutsideABC)
if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
{
closestPtPointTriangle(p, a, b, c,tempResult);
finalResult.m_degenerate = true;
return false;
}
if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
{
return false;
}
b3Scalar bestSqDist = FLT_MAX;
// If point outside face abc then compute closest point on abc
if (pointOutsideABC)
{
closestPtPointTriangle(p, a, b, c, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
b3Scalar sqDist = (q - p).dot( q - p);
// Update best closest point if (squared) distance is less than current best
if (sqDist < bestSqDist) {
b3Scalar sqDist = (q - p).dot(q - p);
// Update best closest point if (squared) distance is less than current best
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
//convert result bitmask!
@@ -501,25 +478,22 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const
finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
finalResult.setBarycentricCoordinates(
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC],
0
);
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC],
0);
}
}
}
// Repeat test for face acd
if (pointOutsideACD)
if (pointOutsideACD)
{
closestPtPointTriangle(p, a, c, d,tempResult);
closestPtPointTriangle(p, a, c, d, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
//convert result bitmask!
b3Scalar sqDist = (q - p).dot( q - p);
if (sqDist < bestSqDist)
b3Scalar sqDist = (q - p).dot(q - p);
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
@@ -529,52 +503,46 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const
finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
finalResult.setBarycentricCoordinates(
tempResult.m_barycentricCoords[VERTA],
0,
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC]
);
tempResult.m_barycentricCoords[VERTA],
0,
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC]);
}
}
// Repeat test for face adb
}
// Repeat test for face adb
if (pointOutsideADB)
{
closestPtPointTriangle(p, a, d, b,tempResult);
closestPtPointTriangle(p, a, d, b, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
//convert result bitmask!
b3Scalar sqDist = (q - p).dot( q - p);
if (sqDist < bestSqDist)
b3Scalar sqDist = (q - p).dot(q - p);
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
finalResult.m_usedVertices.reset();
finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;
finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
finalResult.setBarycentricCoordinates(
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
0,
tempResult.m_barycentricCoords[VERTB]
);
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
0,
tempResult.m_barycentricCoords[VERTB]);
}
}
// Repeat test for face bdc
}
// Repeat test for face bdc
if (pointOutsideBDC)
{
closestPtPointTriangle(p, b, d, c,tempResult);
closestPtPointTriangle(p, b, d, c, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
//convert result bitmask!
b3Scalar sqDist = (q - p).dot( q - p);
if (sqDist < bestSqDist)
b3Scalar sqDist = (q - p).dot(q - p);
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
@@ -585,25 +553,22 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const
finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
finalResult.setBarycentricCoordinates(
0,
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
tempResult.m_barycentricCoords[VERTB]
);
0,
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
tempResult.m_barycentricCoords[VERTB]);
}
}
}
//help! we ended up full !
if (finalResult.m_usedVertices.usedVertexA &&
finalResult.m_usedVertices.usedVertexB &&
finalResult.m_usedVertices.usedVertexC &&
finalResult.m_usedVertices.usedVertexD)
finalResult.m_usedVertices.usedVertexD)
{
return true;
}
return true;
return true;
}

View File

@@ -13,22 +13,19 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_VORONOI_SIMPLEX_SOLVER_H
#define B3_VORONOI_SIMPLEX_SOLVER_H
#include "Bullet3Common/b3Vector3.h"
#define VORONOI_SIMPLEX_MAX_VERTS 5
///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure
//#define BT_USE_EQUAL_VERTEX_THRESHOLD
#define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f
struct b3UsageBitfield{
struct b3UsageBitfield
{
b3UsageBitfield()
{
reset();
@@ -41,137 +38,127 @@ struct b3UsageBitfield{
usedVertexC = false;
usedVertexD = false;
}
unsigned short usedVertexA : 1;
unsigned short usedVertexB : 1;
unsigned short usedVertexC : 1;
unsigned short usedVertexD : 1;
unsigned short unused1 : 1;
unsigned short unused2 : 1;
unsigned short unused3 : 1;
unsigned short unused4 : 1;
unsigned short usedVertexA : 1;
unsigned short usedVertexB : 1;
unsigned short usedVertexC : 1;
unsigned short usedVertexD : 1;
unsigned short unused1 : 1;
unsigned short unused2 : 1;
unsigned short unused3 : 1;
unsigned short unused4 : 1;
};
struct b3SubSimplexClosestResult
struct b3SubSimplexClosestResult
{
b3Vector3 m_closestPointOnSimplex;
b3Vector3 m_closestPointOnSimplex;
//MASK for m_usedVertices
//stores the simplex vertex-usage, using the MASK,
//stores the simplex vertex-usage, using the MASK,
// if m_usedVertices & MASK then the related vertex is used
b3UsageBitfield m_usedVertices;
b3Scalar m_barycentricCoords[4];
b3UsageBitfield m_usedVertices;
b3Scalar m_barycentricCoords[4];
bool m_degenerate;
void reset()
void reset()
{
m_degenerate = false;
setBarycentricCoordinates();
m_usedVertices.reset();
}
bool isValid()
bool isValid()
{
bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) &&
(m_barycentricCoords[1] >= b3Scalar(0.)) &&
(m_barycentricCoords[2] >= b3Scalar(0.)) &&
(m_barycentricCoords[3] >= b3Scalar(0.));
(m_barycentricCoords[1] >= b3Scalar(0.)) &&
(m_barycentricCoords[2] >= b3Scalar(0.)) &&
(m_barycentricCoords[3] >= b3Scalar(0.));
return valid;
}
void setBarycentricCoordinates(b3Scalar a=b3Scalar(0.),b3Scalar b=b3Scalar(0.),b3Scalar c=b3Scalar(0.),b3Scalar d=b3Scalar(0.))
void setBarycentricCoordinates(b3Scalar a = b3Scalar(0.), b3Scalar b = b3Scalar(0.), b3Scalar c = b3Scalar(0.), b3Scalar d = b3Scalar(0.))
{
m_barycentricCoords[0] = a;
m_barycentricCoords[1] = b;
m_barycentricCoords[2] = c;
m_barycentricCoords[3] = d;
}
};
/// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
/// Can be used with GJK, as an alternative to Johnson distance algorithm.
B3_ATTRIBUTE_ALIGNED16(class) b3VoronoiSimplexSolver
B3_ATTRIBUTE_ALIGNED16(class)
b3VoronoiSimplexSolver
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
int m_numVertices;
int m_numVertices;
b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_cachedP1;
b3Vector3 m_cachedP2;
b3Vector3 m_cachedV;
b3Vector3 m_lastW;
b3Scalar m_equalVertexThreshold;
bool m_cachedValidClosest;
b3Vector3 m_cachedP1;
b3Vector3 m_cachedP2;
b3Vector3 m_cachedV;
b3Vector3 m_lastW;
b3Scalar m_equalVertexThreshold;
bool m_cachedValidClosest;
b3SubSimplexClosestResult m_cachedBC;
bool m_needsUpdate;
void removeVertex(int index);
void reduceVertices (const b3UsageBitfield& usedVerts);
bool updateClosestVectorAndPoints();
bool m_needsUpdate;
bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result);
void removeVertex(int index);
void reduceVertices(const b3UsageBitfield& usedVerts);
bool updateClosestVectorAndPoints();
bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result);
public:
b3VoronoiSimplexSolver()
: m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
: m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
{
}
void reset();
void reset();
void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);
void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);
void setEqualVertexThreshold(b3Scalar threshold)
{
m_equalVertexThreshold = threshold;
}
void setEqualVertexThreshold(b3Scalar threshold)
{
m_equalVertexThreshold = threshold;
}
b3Scalar getEqualVertexThreshold() const
{
return m_equalVertexThreshold;
}
b3Scalar getEqualVertexThreshold() const
{
return m_equalVertexThreshold;
}
bool closest(b3Vector3& v);
bool closest(b3Vector3 & v);
b3Scalar maxVertex();
b3Scalar maxVertex();
bool fullSimplex() const
{
return (m_numVertices == 4);
}
bool fullSimplex() const
{
return (m_numVertices == 4);
}
int getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const;
int getSimplex(b3Vector3 * pBuf, b3Vector3 * qBuf, b3Vector3 * yBuf) const;
bool inSimplex(const b3Vector3& w);
void backup_closest(b3Vector3& v) ;
bool inSimplex(const b3Vector3& w);
bool emptySimplex() const ;
void backup_closest(b3Vector3 & v);
void compute_points(b3Vector3& p1, b3Vector3& p2) ;
int numVertices() const
{
return m_numVertices;
}
bool emptySimplex() const;
void compute_points(b3Vector3 & p1, b3Vector3 & p2);
int numVertices() const
{
return m_numVertices;
}
};
#endif //B3_VORONOI_SIMPLEX_SOLVER_H
#endif //B3_VORONOI_SIMPLEX_SOLVER_H

View File

@@ -1,258 +1,257 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* bvhTraversalKernelCL= \
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef unsigned int u32;\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} btQuantizedBvhNode;\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"} b3BvhInfo;\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} btBvhSubtreeInfo;\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} btCollidableGpu;\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} btGpuChildShape;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"}\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int4* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
;
static const char* bvhTraversalKernelCL =
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef unsigned int u32;\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} btQuantizedBvhNode;\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"} b3BvhInfo;\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} btBvhSubtreeInfo;\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} btCollidableGpu;\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} btGpuChildShape;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"}\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int4* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -19,149 +19,139 @@ subject to the following restrictions:
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "b3BoundSearchCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3LauncherCL.h"
#include "kernels/BoundSearchKernelsCL.h"
b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
:m_context(ctx),
m_device(device),
m_queue(queue)
: m_context(ctx),
m_device(device),
m_queue(queue)
{
const char* additionalMacros = "";
//const char* srcFileNameForCaching="";
cl_int pErrNum;
const char* kernelSource = boundSearchKernelsCL;
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH);
b3Assert(boundSearchProg);
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
b3Assert(m_lowerSortDataKernel );
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_lowerSortDataKernel);
m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_upperSortDataKernel);
m_subtractKernel = 0;
if( maxSize )
if (maxSize)
{
m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_subtractKernel);
}
//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );
m_filler = new b3FillCL(ctx,device,queue);
m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_upper = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_filler = new b3FillCL(ctx, device, queue);
}
b3BoundSearchCL::~b3BoundSearchCL()
{
delete m_lower;
delete m_upper;
delete m_filler;
clReleaseKernel(m_lowerSortDataKernel);
clReleaseKernel(m_upperSortDataKernel);
clReleaseKernel(m_subtractKernel);
}
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option)
{
b3Int4 constBuffer;
constBuffer.x = nSrc;
constBuffer.y = nDst;
if( option == BOUND_LOWER )
if (option == BOUND_LOWER)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
launcher.launch1D( nSrc, 64 );
b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nSrc, 64);
}
else if( option == BOUND_UPPER )
else if (option == BOUND_UPPER)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
b3LauncherCL launcher(m_queue, m_upperSortDataKernel, "m_upperSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D( nSrc, 64 );
launcher.launch1D(nSrc, 64);
}
else if( option == COUNT )
else if (option == COUNT)
{
b3Assert( m_lower );
b3Assert( m_upper );
b3Assert( m_lower->capacity() <= (int)nDst );
b3Assert( m_upper->capacity() <= (int)nDst );
b3Assert(m_lower);
b3Assert(m_upper);
b3Assert(m_lower->capacity() <= (int)nDst);
b3Assert(m_upper->capacity() <= (int)nDst);
int zero = 0;
m_filler->execute( *m_lower, zero, nDst );
m_filler->execute( *m_upper, zero, nDst );
m_filler->execute(*m_lower, zero, nDst);
m_filler->execute(*m_upper, zero, nDst);
execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
execute(src, nSrc, *m_lower, nDst, BOUND_LOWER);
execute(src, nSrc, *m_upper, nDst, BOUND_UPPER);
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher( m_queue, m_subtractKernel ,"m_subtractKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D( nDst, 64 );
launcher.launch1D(nDst, 64);
}
}
else
{
b3Assert( 0 );
b3Assert(0);
}
}
void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option )
void b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option)
{
for (int i = 0; i < nSrc - 1; i++)
b3Assert(src[i].m_key <= src[i + 1].m_key);
for(int i=0; i<nSrc-1; i++)
b3Assert( src[i].m_key <= src[i+1].m_key );
b3SortData minData,zeroData,maxData;
b3SortData minData, zeroData, maxData;
minData.m_key = -1;
minData.m_value = -1;
zeroData.m_key=0;
zeroData.m_value=0;
zeroData.m_key = 0;
zeroData.m_value = 0;
maxData.m_key = nDst;
maxData.m_value = nDst;
if( option == BOUND_LOWER )
if (option == BOUND_LOWER)
{
for(int i=0; i<nSrc; i++)
for (int i = 0; i < nSrc; i++)
{
b3SortData& iData = (i==0)? minData: src[i-1];
b3SortData& jData = (i==nSrc)? maxData: src[i];
b3SortData& iData = (i == 0) ? minData : src[i - 1];
b3SortData& jData = (i == nSrc) ? maxData : src[i];
if( iData.m_key != jData.m_key )
if (iData.m_key != jData.m_key)
{
int k = jData.m_key;
{
@@ -170,14 +160,14 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
}
}
}
else if( option == BOUND_UPPER )
else if (option == BOUND_UPPER)
{
for(int i=1; i<nSrc+1; i++)
for (int i = 1; i < nSrc + 1; i++)
{
b3SortData& iData = src[i-1];
b3SortData& jData = (i==nSrc)? maxData: src[i];
b3SortData& iData = src[i - 1];
b3SortData& jData = (i == nSrc) ? maxData : src[i];
if( iData.m_key != jData.m_key )
if (iData.m_key != jData.m_key)
{
int k = iData.m_key;
{
@@ -186,28 +176,28 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
}
}
}
else if( option == COUNT )
else if (option == COUNT)
{
b3AlignedObjectArray<unsigned int> lower;
lower.resize(nDst );
lower.resize(nDst);
b3AlignedObjectArray<unsigned int> upper;
upper.resize(nDst );
upper.resize(nDst);
for(int i=0; i<nDst; i++)
{
lower[i] = upper[i] = 0;
for (int i = 0; i < nDst; i++)
{
lower[i] = upper[i] = 0;
}
executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
executeHost(src, nSrc, lower, nDst, BOUND_LOWER);
executeHost(src, nSrc, upper, nDst, BOUND_UPPER);
for( int i=0; i<nDst; i++)
{
dst[i] = upper[i] - lower[i];
for (int i = 0; i < nDst; i++)
{
dst[i] = upper[i] - lower[i];
}
}
else
{
b3Assert( 0 );
b3Assert(0);
}
}

View File

@@ -26,42 +26,39 @@ subject to the following restrictions:
#include "b3OpenCLArray.h"
#include "b3FillCL.h"
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
class b3BoundSearchCL
{
public:
public:
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
cl_kernel m_subtractKernel;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
cl_kernel m_subtractKernel;
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
b3FillCL* m_filler;
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
virtual ~b3BoundSearchCL();
b3FillCL* m_filler;
// src has to be src[i].m_key <= src[i+1].m_key
void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
virtual ~b3BoundSearchCL();
// src has to be src[i].m_key <= src[i+1].m_key
void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //B3_BOUNDSEARCH_H
#endif //B3_BOUNDSEARCH_H

View File

@@ -4,16 +4,15 @@
#include "b3OpenCLArray.h"
struct b3BufferInfoCL
{
//b3BufferInfoCL(){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {}
cl_mem m_clBuffer;
bool m_isReadOnly;
};
#endif //B3_BUFFER_INFO_CL_H
#endif //B3_BUFFER_INFO_CL_H

View File

@@ -8,29 +8,26 @@
#include "kernels/FillKernelsCL.h"
b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
:m_commandQueue(queue)
: m_commandQueue(queue)
{
const char* kernelSource = fillKernelsCL;
cl_int pErrNum;
const char* additionalMacros = "";
cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH);
b3Assert(fillProg);
m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillIntKernel);
m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillIntKernel);
m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillFloatKernel);
m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillKernelInt2);
}
b3FillCL::~b3FillCL()
@@ -39,88 +36,84 @@ b3FillCL::~b3FillCL()
clReleaseKernel(m_fillIntKernel);
clReleaseKernel(m_fillUnsignedIntKernel);
clReleaseKernel(m_fillFloatKernel);
}
void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" );
launcher.setBuffer( src.getBufferCL());
launcher.setConst( n );
launcher.setConst( value );
launcher.setConst( offset);
b3LauncherCL launcher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel");
launcher.setBuffer(src.getBufferCL());
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D( n );
launcher.launch1D(n);
}
}
void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel");
b3LauncherCL launcher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel");
launcher.setBuffer(src.getBufferCL());
launcher.setConst( n);
launcher.setConst( value);
launcher.setConst( offset);
launcher.launch1D( n );
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D(n);
}
}
void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};
b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( n );
launcher.setConst(value);
b3LauncherCL launcher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D( n );
launcher.launch1D(n);
}
}
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
for (int i=0;i<n;i++)
for (int i = 0; i < n; i++)
{
src[i+offset]=value;
src[i + offset] = value;
}
}
void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset)
{
for (int i=0;i<n;i++)
for (int i = 0; i < n; i++)
{
src[i+offset]=value;
src[i + offset] = value;
}
}
void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
//( constBuffer );
launcher.launch1D( n );
launcher.launch1D(n);
}
}

View File

@@ -7,57 +7,46 @@
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
class b3FillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
cl_command_queue m_commandQueue;
public:
struct b3ConstData
{
union
{
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct b3ConstData
{
union {
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
protected:
public:
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~b3FillCL();
virtual ~b3FillCL();
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset);
void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset);
// void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
};
#endif //B3_FILL_CL_H
#endif //B3_FILL_CL_H

View File

@@ -1,13 +1,13 @@
#include "b3LauncherCL.h"
bool gDebugLauncherCL = false;
b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name)
:m_commandQueue(queue),
m_kernel(kernel),
m_idx(0),
m_enableSerialization(false),
m_name(name)
: m_commandQueue(queue),
m_kernel(kernel),
m_idx(0),
m_enableSerialization(false),
m_name(name)
{
if (gDebugLauncherCL)
{
@@ -15,59 +15,58 @@ m_name(name)
printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
}
m_serializationSizeInBytes = sizeof(int);
m_serializationSizeInBytes = sizeof(int);
}
b3LauncherCL::~b3LauncherCL()
{
for (int i=0;i<m_arrays.size();i++)
{
delete (m_arrays[i]);
}
m_arrays.clear();
if (gDebugLauncherCL)
{
static int counter = 0;
printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name);
}
}
void b3LauncherCL::setBuffer( cl_mem clBuffer)
{
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert( err == CL_SUCCESS );
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(b3KernelArgData);
m_serializationSizeInBytes+=param_value;
}
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
b3Assert( status == CL_SUCCESS );
for (int i = 0; i < m_arrays.size(); i++)
{
delete (m_arrays[i]);
}
m_arrays.clear();
if (gDebugLauncherCL)
{
static int counter = 0;
printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name);
}
}
void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
void b3LauncherCL::setBuffer(cl_mem clBuffer)
{
for(int i=0; i<n; i++)
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo(kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert(err == CL_SUCCESS);
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes += sizeof(b3KernelArgData);
m_serializationSizeInBytes += param_value;
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
b3Assert(status == CL_SUCCESS);
}
void b3LauncherCL::setBuffers(b3BufferInfoCL* buffInfo, int n)
{
for (int i = 0; i < n; i++)
{
if (m_enableSerialization)
{
@@ -75,106 +74,103 @@ void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert( err == CL_SUCCESS );
err = clGetMemObjectInfo(kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert(err == CL_SUCCESS);
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(b3KernelArgData);
m_serializationSizeInBytes+=param_value;
}
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
b3Assert( status == CL_SUCCESS );
}
m_serializationSizeInBytes += sizeof(b3KernelArgData);
m_serializationSizeInBytes += param_value;
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
b3Assert(status == CL_SUCCESS);
}
}
struct b3KernelArgDataUnaligned
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union
{
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
#include <string.h>
int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &buf[index];
index+=sizeof(int);
for (int i=0;i<numArguments;i++)
{
b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
int index = 0;
index+=sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert( status == CL_SUCCESS );
index+=arg->m_argSizeInBytes;
} else
{
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert( status == CL_SUCCESS );
}
int numArguments = *(int*)&buf[index];
index += sizeof(int);
for (int i = 0; i < numArguments; i++)
{
b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
index += sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert(status == CL_SUCCESS);
index += arg->m_argSizeInBytes;
}
else
{
cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert(status == CL_SUCCESS);
}
b3KernelArgData b;
memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned));
m_kernelArguments.push_back(b);
}
m_serializationSizeInBytes = index;
return index;
memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned));
m_kernelArguments.push_back(b);
}
m_serializationSizeInBytes = index;
return index;
}
int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &goldBuffer[index];
index+=sizeof(int);
{
int index = 0;
int numArguments = *(int*)&goldBuffer[index];
index += sizeof(int);
if (numArguments != m_kernelArguments.size())
{
printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size());
return -1;
}
for (int ii=0;ii<numArguments;ii++)
{
b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
for (int ii = 0; ii < numArguments; ii++)
{
b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
{
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
return -2;
}
@@ -184,125 +180,117 @@ int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapac
if (expected != found)
{
printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found);
return -3;
}
}
index+=sizeof(b3KernelArgData);
index += sizeof(b3KernelArgData);
if (argGold->m_isBuffer)
{
unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
{
unsigned char* memBuf = (unsigned char*)malloc(m_kernelArguments[ii].m_argSizeInBytes);
unsigned char* goldBuf = &goldBuffer[index];
for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
for (int j = 0; j < m_kernelArguments[j].m_argSizeInBytes; j++)
{
memBuf[j] = 0xaa;
}
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
memBuf, 0,0,0 );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
memBuf, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
clFinish(m_commandQueue);
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
{
int expected = goldBuf[b];
int found = memBuf[b];
if (expected != found)
{
printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
ii, b, expected, found);
ii, b, expected, found);
return -4;
}
}
index+=argGold->m_argSizeInBytes;
} else
{
index += argGold->m_argSizeInBytes;
}
else
{
//compare content
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
{
int expected = argGold->m_argData[b];
int found =m_kernelArguments[ii].m_argData[b];
int found = m_kernelArguments[ii].m_argData[b];
if (expected != found)
{
printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
ii, b, expected, found);
ii, b, expected, found);
return -5;
}
}
}
}
return index;
}
}
return index;
}
int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i=0;i<destBufferCapacity;i++)
destBuffer[i] = 0xec;
//initialize to known values
for (int i = 0; i < destBufferCapacity; i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity>=m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i=0;i<this->m_kernelArguments.size();i++)
{
b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize+=sizeof(b3KernelArgData);
if (arg->m_isBuffer==1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0,0,0 );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
curBufferSize+=arg->m_argSizeInBytes;
}
}
return curBufferSize;
assert(destBufferCapacity >= m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i = 0; i < this->m_kernelArguments.size(); i++)
{
b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize += sizeof(b3KernelArgData);
if (arg->m_isBuffer == 1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0, 0, 0);
b3Assert(status == CL_SUCCESS);
clFinish(m_commandQueue);
curBufferSize += arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
{
int num = numWorkItems;
int buffSize = getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
for (int i=0;i<buffSize+1;i++)
unsigned char* buf = new unsigned char[buffSize + sizeof(int)];
for (int i = 0; i < buffSize + 1; i++)
{
unsigned char* ptr = (unsigned char*)&buf[i];
*ptr = 0xff;
}
// int actualWrite = serializeArguments(buf,buffSize);
// unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize]==0xff);//check for buffer overrun
// int actualWrite = serializeArguments(buf,buffSize);
// unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize] == 0xff); //check for buffer overrun
int* ptr = (int*)&buf[buffSize];
*ptr = num;
FILE* f = fopen(fileName,"wb");
fwrite(buf,buffSize+sizeof(int),1,f);
FILE* f = fopen(fileName, "wb");
fwrite(buf, buffSize + sizeof(int), 1, f);
fclose(f);
delete[] buf;
}
}

View File

@@ -9,60 +9,57 @@
#define B3_DEBUG_SERIALIZE_CL
#ifdef _WIN32
#pragma warning(disable :4996)
#pragma warning(disable : 4996)
#endif
#define B3_CL_MAX_ARG_SIZE 16
B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData
B3_ATTRIBUTE_ALIGNED16(struct)
b3KernelArgData
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union
{
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
class b3LauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
int m_idx;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
bool m_enableSerialization;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
bool m_enableSerialization;
const char* m_name;
public:
b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays;
b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
virtual ~b3LauncherCL();
void setBuffer( cl_mem clBuffer);
public:
b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays;
void setBuffers( b3BufferInfoCL* buffInfo, int n );
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
virtual ~b3LauncherCL();
void setBuffer(cl_mem clBuffer);
void setBuffers(b3BufferInfoCL* buffInfo, int n);
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);
int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
int getNumArguments() const
{
return m_kernelArguments.size();
@@ -75,61 +72,57 @@ class b3LauncherCL
void serializeToFile(const char* fileName, int numWorkItems);
template<typename T>
inline void setConst( const T& consts )
{
int sz=sizeof(T);
b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
template <typename T>
inline void setConst(const T& consts)
{
int sz = sizeof(T);
b3Assert(sz <= B3_CL_MAX_ARG_SIZE);
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+=sizeof(b3KernelArgData);
}
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
b3Assert( status == CL_SUCCESS );
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes += sizeof(b3KernelArgData);
}
inline void launch1D( int numThreads, int localSize = 64)
{
launch2D( numThreads, 1, localSize, 1 );
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts);
b3Assert(status == CL_SUCCESS);
}
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
size_t gRange[3] = {1,1,1};
size_t lRange[3] = {1,1,1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
gRange[0] *= lRange[0];
gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
gRange[1] *= lRange[1];
inline void launch1D(int numThreads, int localSize = 64)
{
launch2D(numThreads, 1, localSize, 1);
}
cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
if (status != CL_SUCCESS)
{
printf("Error: OpenCL status = %d\n",status);
}
b3Assert( status == CL_SUCCESS );
inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY)
{
size_t gRange[3] = {1, 1, 1};
size_t lRange[3] = {1, 1, 1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1));
gRange[0] *= lRange[0];
gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 0 : 1));
gRange[1] *= lRange[1];
}
void enableSerialization(bool serialize)
cl_int status = clEnqueueNDRangeKernel(m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0, 0, 0);
if (status != CL_SUCCESS)
{
m_enableSerialization = serialize;
printf("Error: OpenCL status = %d\n", status);
}
b3Assert(status == CL_SUCCESS);
}
void enableSerialization(bool serialize)
{
m_enableSerialization = serialize;
}
};
#endif //B3_LAUNCHER_CL_H
#endif //B3_LAUNCHER_CL_H

View File

@@ -7,16 +7,16 @@
template <typename T>
class b3OpenCLArray
{
size_t m_size;
size_t m_capacity;
cl_mem m_clBuffer;
size_t m_size;
size_t m_capacity;
cl_mem m_clBuffer;
cl_context m_clContext;
cl_context m_clContext;
cl_command_queue m_commandQueue;
bool m_ownsMemory;
bool m_ownsMemory;
bool m_allowGrowingCapacity;
bool m_allowGrowingCapacity;
void deallocate()
{
@@ -25,22 +25,19 @@ class b3OpenCLArray
clReleaseMemObject(m_clBuffer);
}
m_clBuffer = 0;
m_capacity=0;
m_capacity = 0;
}
b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
B3_FORCE_INLINE size_t allocSize(size_t size)
{
return (size ? size*2 : 1);
}
B3_FORCE_INLINE size_t allocSize(size_t size)
{
return (size ? size * 2 : 1);
}
public:
b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true)
:m_size(0), m_capacity(0),m_clBuffer(0),
m_clContext(ctx),m_commandQueue(queue),
m_ownsMemory(true),m_allowGrowingCapacity(true)
b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true)
: m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true)
{
if (initialCapacity)
{
@@ -60,34 +57,32 @@ public:
m_capacity = sizeInElements;
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
// }
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
// }
cl_mem getBufferCL() const
cl_mem getBufferCL() const
{
return m_clBuffer;
}
virtual ~b3OpenCLArray()
{
deallocate();
m_size=0;
m_capacity=0;
m_size = 0;
m_capacity = 0;
}
B3_FORCE_INLINE bool push_back(const T& _Val,bool waitForCompletion=true)
B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true)
{
bool result = true;
size_t sz = size();
if( sz == capacity() )
if (sz == capacity())
{
result = reserve( allocSize(size()) );
result = reserve(allocSize(size()));
}
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
m_size++;
@@ -96,23 +91,23 @@ public:
B3_FORCE_INLINE T forcedAt(size_t n) const
{
b3Assert(n>=0);
b3Assert(n<capacity());
b3Assert(n >= 0);
b3Assert(n < capacity());
T elem;
copyToHostPointer(&elem,1,n,true);
copyToHostPointer(&elem, 1, n, true);
return elem;
}
B3_FORCE_INLINE T at(size_t n) const
{
b3Assert(n>=0);
b3Assert(n<size());
b3Assert(n >= 0);
b3Assert(n < size());
T elem;
copyToHostPointer(&elem,1,n,true);
copyToHostPointer(&elem, 1, n, true);
return elem;
}
B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents=true)
B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true)
{
bool result = true;
size_t curSize = size();
@@ -120,11 +115,12 @@ public:
if (newsize < curSize)
{
//leave the OpenCL memory for now
} else
}
else
{
if (newsize > size())
{
result = reserve(newsize,copyOldContents);
result = reserve(newsize, copyOldContents);
}
//leave new data uninitialized (init in debug mode?)
@@ -134,7 +130,8 @@ public:
if (result)
{
m_size = newsize;
} else
}
else
{
m_size = 0;
}
@@ -146,25 +143,25 @@ public:
return m_size;
}
B3_FORCE_INLINE size_t capacity() const
B3_FORCE_INLINE size_t capacity() const
{
return m_capacity;
}
B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents=true)
B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true)
{
bool result=true;
bool result = true;
// determine new minimum length of allocated storage
if (capacity() < _Count)
{ // not enough room, reallocate
{ // not enough room, reallocate
if (m_allowGrowingCapacity)
{
cl_int ciErrNum;
//create a new OpenCL buffer
size_t memSizeInBytes = sizeof(T)*_Count;
size_t memSizeInBytes = sizeof(T) * _Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
if (ciErrNum!=CL_SUCCESS)
if (ciErrNum != CL_SUCCESS)
{
b3Error("OpenCL out-of-memory\n");
_Count = 0;
@@ -173,13 +170,13 @@ public:
//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (size_t i=0;i<memSizeInBytes;i++)
for (size_t i = 0; i < memSizeInBytes; i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
b3Assert(ciErrNum==CL_SUCCESS);
ciErrNum = clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0);
b3Assert(ciErrNum == CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (result)
{
@@ -193,21 +190,21 @@ public:
m_clBuffer = buf;
m_capacity = _Count;
} else
}
else
{
//fail: assert and
b3Assert(0);
deallocate();
result=false;
result = false;
}
}
return result;
}
void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const
void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const
{
if (numElements<=0)
if (numElements <= 0)
return;
b3Assert(m_clBuffer);
@@ -216,75 +213,74 @@ public:
//likely some error, destination is same as source
b3Assert(m_clBuffer != destination);
b3Assert((firstElem+numElements)<=m_size);
b3Assert((firstElem + numElements) <= m_size);
cl_int status = 0;
b3Assert(numElements > 0);
b3Assert(numElements <= m_size);
b3Assert(numElements>0);
b3Assert(numElements<=m_size);
size_t srcOffsetBytes = sizeof(T) * firstElem;
size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems;
size_t srcOffsetBytes = sizeof(T)*firstElem;
size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0);
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
b3Assert( status == CL_SUCCESS );
b3Assert(status == CL_SUCCESS);
}
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true)
{
size_t newSize = srcArray.size();
bool copyOldContents = false;
resize (newSize,copyOldContents);
resize(newSize, copyOldContents);
if (newSize)
copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion);
}
void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true)
void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true)
{
b3Assert(numElems+destFirstElem <= capacity());
b3Assert(numElems + destFirstElem <= capacity());
if (numElems+destFirstElem)
if (numElems + destFirstElem)
{
cl_int status = 0;
size_t sizeInBytes=sizeof(T)*numElems;
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
src, 0,0,0 );
b3Assert(status == CL_SUCCESS );
size_t sizeInBytes = sizeof(T) * numElems;
status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes,
src, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
} else
}
else
{
b3Error("copyFromHostPointer invalid range\n");
}
}
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const
{
destArray.resize(this->size());
if (size())
copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
copyToHostPointer(&destArray[0], size(), 0, waitForCompletion);
}
void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const
void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const
{
b3Assert(numElem+srcFirstElem <= capacity());
b3Assert(numElem + srcFirstElem <= capacity());
if(numElem+srcFirstElem <= capacity())
if (numElem + srcFirstElem <= capacity())
{
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
destPtr, 0,0,0 );
b3Assert( status==CL_SUCCESS );
status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem,
destPtr, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
} else
}
else
{
b3Error("copyToHostPointer invalid range\n");
}
@@ -296,11 +292,9 @@ public:
resize(newSize);
if (size())
{
src.copyToCL(m_clBuffer,size());
src.copyToCL(m_clBuffer, size());
}
}
};
#endif //B3_OPENCL_ARRAY_H
#endif //B3_OPENCL_ARRAY_H

View File

@@ -7,25 +7,24 @@
#include "kernels/PrefixScanKernelsCL.h"
b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsCL;
cl_int pErrNum;
char* additionalMacros=0;
char* additionalMacros = 0;
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH);
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_localScanKernel );
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_blockSumKernel );
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_propagationKernel );
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanCL::~b3PrefixScanCL()
{
delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanCL::~b3PrefixScanCL()
clReleaseKernel(m_propagationKernel);
}
template<class T>
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
n = n | (n>>i);
return n+1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<uns
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2( numBlocks );
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<unsigned int>* srcNative = &src;
b3OpenCLArray<unsigned int>* dstNative = &dst;
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if( numBlocks > 1 )
if (numBlocks > 1)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if( sum )
if (sum)
{
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum,1,n-1,true);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
unsigned int s = 0;
//if( data->m_option == EXCLUSIVE )
{
for(int i=0; i<n; i++)
for (int i = 0; i < n; i++)
{
dst[i] = s;
s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3Alig
}
*/
if( sum )
if (sum)
{
*sum = dst[n-1];
*sum = dst[n - 1];
}
}

View File

@@ -13,9 +13,9 @@ class b3PrefixScanCL
BLOCK_SIZE = 128
};
// Option m_option;
// Option m_option;
cl_command_queue m_commandQueue;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
@@ -23,15 +23,13 @@ class b3PrefixScanCL
b3OpenCLArray<unsigned int>* m_workBuffer;
public:
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
public:
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
virtual ~b3PrefixScanCL();
void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0);
};
#endif //B3_PREFIX_SCAN_CL_H
#endif //B3_PREFIX_SCAN_CL_H

View File

@@ -7,25 +7,24 @@
#include "kernels/PrefixScanKernelsFloat4CL.h"
b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsFloat4CL;
cl_int pErrNum;
char* additionalMacros=0;
char* additionalMacros = 0;
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_localScanKernel );
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_blockSumKernel );
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_propagationKernel );
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
{
delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
clReleaseKernel(m_propagationKernel);
}
template<class T>
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
n = n | (n>>i);
return n+1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2( numBlocks );
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<b3Vector3>* srcNative = &src;
b3OpenCLArray<b3Vector3>* dstNative = &dst;
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if( numBlocks > 1 )
if (numBlocks > 1)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if( sum )
if (sum)
{
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum,1,n-1,true);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
b3Vector3 s=b3MakeVector3(0,0,0);
b3Vector3 s = b3MakeVector3(0, 0, 0);
//if( data->m_option == EXCLUSIVE )
{
for(int i=0; i<n; i++)
for (int i = 0; i < n; i++)
{
dst[i] = s;
s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3A
}
*/
if( sum )
if (sum)
{
*sum = dst[n-1];
*sum = dst[n - 1];
}
}

View File

@@ -14,9 +14,9 @@ class b3PrefixScanFloat4CL
BLOCK_SIZE = 128
};
// Option m_option;
// Option m_option;
cl_command_queue m_commandQueue;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
@@ -24,10 +24,8 @@ class b3PrefixScanFloat4CL
b3OpenCLArray<b3Vector3>* m_workBuffer;
public:
b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
public:
b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
virtual ~b3PrefixScanFloat4CL();
@@ -35,4 +33,4 @@ class b3PrefixScanFloat4CL
void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
};
#endif //B3_PREFIX_SCAN_CL_H
#endif //B3_PREFIX_SCAN_CL_H

File diff suppressed because it is too large Load Diff

View File

@@ -6,90 +6,79 @@
struct b3SortData
{
union
{
union {
unsigned int m_key;
unsigned int x;
};
union
{
union {
unsigned int m_value;
unsigned int y;
};
};
#include "b3BufferInfoCL.h"
class b3RadixSort32CL
class b3RadixSort32CL
{
b3OpenCLArray<unsigned int>* m_workBuffer1;
b3OpenCLArray<unsigned int>* m_workBuffer2;
b3OpenCLArray<unsigned int>* m_workBuffer1;
b3OpenCLArray<unsigned int>* m_workBuffer2;
b3OpenCLArray<b3SortData>* m_workBuffer3;
b3OpenCLArray<b3SortData>* m_workBuffer4;
b3OpenCLArray<b3SortData>* m_workBuffer3;
b3OpenCLArray<b3SortData>* m_workBuffer4;
b3OpenCLArray<unsigned int>* m_workBuffer3a;
b3OpenCLArray<unsigned int>* m_workBuffer4a;
b3OpenCLArray<unsigned int>* m_workBuffer3a;
b3OpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
bool m_deviceCPU;
bool m_deviceCPU;
class b3PrefixScanCL* m_scan;
class b3FillCL* m_fill;
class b3PrefixScanCL* m_scan;
class b3FillCL* m_fill;
public:
struct b3ConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET = (1 << BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20 * 6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~b3RadixSort32CL();
virtual ~b3RadixSort32CL();
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 );
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H
#endif //B3_RADIXSORT32_H

View File

@@ -1,87 +1,86 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* boundSearchKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX+1;\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n"
;
static const char* boundSearchKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX+1;\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n";

View File

@@ -1,132 +1,131 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for buffer-copy kernels (1/2/4-wide float4 copies
// plus float and float2 variants). Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* copyKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx <= cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx <= cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"\n"
;
// Stringified OpenCL C source for buffer-copy kernels (1/2/4-wide float4 copies
// plus float and float2 variants). Post-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* copyKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx <= cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx <= cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"\n";

View File

@@ -1,91 +1,90 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for buffer-fill kernels (int, float, unsigned int,
// int2, int4 fills with an element offset). Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* fillKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"typedef struct\n"
"{\n"
" union\n"
" {\n"
" int4 m_data;\n"
" uint4 m_unsignedData;\n"
" float m_floatData;\n"
" };\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstFloat[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt4[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
;
// Stringified OpenCL C source for buffer-fill kernels (int, float, unsigned int,
// int2, int4 fills with an element offset). Post-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* fillKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"typedef struct\n"
"{\n"
" union\n"
" {\n"
" int4 m_data;\n"
" uint4 m_unsignedData;\n"
" float m_floatData;\n"
" };\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstFloat[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt4[ offset+gIdx ] = value;\n"
" }\n"
"}\n";

View File

@@ -1,129 +1,128 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for the u32 prefix-scan (exclusive scan) kernels:
// per-workgroup LocalScanKernel, block-sum TopLevelScanKernel, AddOffsetKernel.
// Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* prefixScanKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" u32 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
;
// Stringified OpenCL C source for the u32 prefix-scan (exclusive scan) kernels:
// per-workgroup LocalScanKernel, block-sum TopLevelScanKernel, AddOffsetKernel.
// Post-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* prefixScanKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" u32 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n";

View File

@@ -1,129 +1,128 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for the float4 variant of the prefix-scan kernels
// (same Local/AddOffset/TopLevel structure as prefixScanKernelsCL, scanning
// float4 elements). Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* prefixScanKernelsFloat4CL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" float4 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" float4 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
" __local float4 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) \n"
" sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" float4 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
" __local float4 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
;
// Stringified OpenCL program implementing a work-efficient exclusive prefix scan
// (Blelloch up-sweep/down-sweep) over float4 elements. The string is compiled at
// runtime (clCreateProgramWithSource/clBuildProgram); the three kernels are the
// usual scan phases: LocalScanKernel (per-workgroup scan + block sums),
// TopLevelScanKernel (scan of the block sums), AddOffsetKernel (add block offset
// back to each element). cb packs {m_numElems, m_numBlocks, m_numScanBlocks} into
// a uint4 via the #defines inside the string.
// NOTE(review): presumably autogenerated by stringify (premake --stringify) like
// the other kernel headers in this project — do not hand-edit the literal;
// regenerate from the .cl source instead.
static const char* prefixScanKernelsFloat4CL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
"	uint m_numElems;\n"
"	uint m_numBlocks;\n"
"	uint m_numScanBlocks;\n"
"	uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
"	float4 blocksum;\n"
"	int offset = 1;\n"
"	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			data[bi] += data[ai];\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	if( lIdx == 0 )\n"
"	{\n"
"		blocksum = data[ n-1 ];\n"
"		data[ n-1 ] = 0;\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	offset >>= 1;\n"
"	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			float4 temp = data[ai];\n"
"			data[ai] = data[bi];\n"
"			data[bi] += temp;\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
"	__local float4 ldsData[WG_SIZE*2];\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"	if( lIdx == 0 ) \n"
"		sumBuffer[GET_GROUP_IDX] = sum;\n"
"	if( (2*gIdx) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx] = ldsData[2*lIdx];\n"
"	}\n"
"	if( (2*gIdx + 1) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
"	const u32 blockSize = WG_SIZE*2;\n"
"	int myIdx = GET_GROUP_IDX+1;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	float4 iBlockSum = blockSum[myIdx];\n"
"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
"	{\n"
"		dst[i] += iBlockSum;\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
"	__local float4 ldsData[2048];\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int lSize = GET_GROUP_SIZE;\n"
"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
"	{\n"
"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
"	{\n"
"		dst[i] = ldsData[i];\n"
"	}\n"
"	if( gIdx == 0 )\n"
"	{\n"
"		dst[cb.m_numBlocks] = sum;\n"
"	}\n"
"}\n";

View File

@@ -4,7 +4,6 @@
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
@@ -15,38 +14,35 @@
#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
#define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl"
struct b3GpuRaycastInternalData
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_q;
cl_command_queue m_q;
cl_kernel m_raytraceKernel;
cl_kernel m_raytracePairsKernel;
cl_kernel m_findRayRigidPairIndexRanges;
b3GpuParallelLinearBvh* m_plbvh;
b3RadixSort32CL* m_radixSorter;
b3FillCL* m_fill;
//1 element per ray
b3OpenCLArray<b3RayInfo>* m_gpuRays;
b3OpenCLArray<b3RayHit>* m_gpuHitResults;
b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
int m_test;
};
b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q)
b3GpuRaycast::b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q)
{
m_data = new b3GpuRaycastInternalData;
m_data->m_context = ctx;
@@ -59,7 +55,7 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
m_data->m_fill = new b3FillCL(ctx, device, q);
m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
@@ -68,19 +64,17 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
{
cl_int errNum=0;
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,rayCastKernelCL,&errNum,"",B3_RAYCAST_PATH);
b3Assert(errNum==CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
cl_int errNum = 0;
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, &errNum, "", B3_RAYCAST_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "findRayRigidPairIndexRanges", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
}
b3GpuRaycast::~b3GpuRaycast()
@@ -88,78 +82,80 @@ b3GpuRaycast::~b3GpuRaycast()
clReleaseKernel(m_data->m_raytraceKernel);
clReleaseKernel(m_data->m_raytracePairsKernel);
clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
delete m_data->m_plbvh;
delete m_data->m_radixSorter;
delete m_data->m_fill;
delete m_data->m_gpuRays;
delete m_data->m_gpuHitResults;
delete m_data->m_firstRayRigidPairIndexPerRay;
delete m_data->m_numRayRigidPairsPerRay;
delete m_data->m_gpuNumRayRigidPairs;
delete m_data->m_gpuRayRigidPairs;
delete m_data;
}
bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction)
bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction)
{
b3Vector3 rs = rayFrom - spherePos;
b3Vector3 rayDir = rayTo-rayFrom;
float A = b3Dot(rayDir,rayDir);
float B = b3Dot(rs, rayDir);
float C = b3Dot(rs, rs) - (radius * radius);
float D = B * B - A*C;
b3Vector3 rs = rayFrom - spherePos;
b3Vector3 rayDir = rayTo - rayFrom;
if (D > 0.0)
{
float t = (-B - sqrt(D))/A;
float A = b3Dot(rayDir, rayDir);
float B = b3Dot(rs, rayDir);
float C = b3Dot(rs, rs) - (radius * radius);
if ( (t >= 0.0f) && (t < hitFraction) )
{
float D = B * B - A * C;
if (D > 0.0)
{
float t = (-B - sqrt(D)) / A;
if ((t >= 0.0f) && (t < hitFraction))
{
hitFraction = t;
return true;
return true;
}
}
return false;
}
bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly,
const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal)
const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal)
{
float exitFraction = hitFraction;
float enterFraction = -0.1f;
b3Vector3 curHitNormal=b3MakeVector3(0,0,0);
for (int i=0;i<poly.m_numFaces;i++)
b3Vector3 curHitNormal = b3MakeVector3(0, 0, 0);
for (int i = 0; i < poly.m_numFaces; i++)
{
const b3GpuFace& face = faces[poly.m_faceOffset+i];
float fromPlaneDist = b3Dot(rayFromLocal,face.m_plane)+face.m_plane.w;
float toPlaneDist = b3Dot(rayToLocal,face.m_plane)+face.m_plane.w;
if (fromPlaneDist<0.f)
const b3GpuFace& face = faces[poly.m_faceOffset + i];
float fromPlaneDist = b3Dot(rayFromLocal, face.m_plane) + face.m_plane.w;
float toPlaneDist = b3Dot(rayToLocal, face.m_plane) + face.m_plane.w;
if (fromPlaneDist < 0.f)
{
if (toPlaneDist >= 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
if (exitFraction>fraction)
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (exitFraction > fraction)
{
exitFraction = fraction;
}
}
} else
}
}
else
{
if (toPlaneDist<0.f)
if (toPlaneDist < 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (enterFraction <= fraction)
{
enterFraction = fraction;
curHitNormal = face.m_plane;
curHitNormal.w = 0.f;
}
} else
}
else
{
return false;
}
@@ -176,44 +172,41 @@ bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const
return true;
}
void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables,const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
{
// return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables);
// return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables);
B3_PROFILE("castRaysHost");
for (int r=0;r<rays.size();r++)
for (int r = 0; r < rays.size(); r++)
{
b3Vector3 rayFrom = rays[r].m_from;
b3Vector3 rayTo = rays[r].m_to;
float hitFraction = hitResults[r].m_hitFraction;
int hitBodyIndex= -1;
int hitBodyIndex = -1;
b3Vector3 hitNormal;
for (int b=0;b<numBodies;b++)
for (int b = 0; b < numBodies; b++)
{
const b3Vector3& pos = bodies[b].m_pos;
//const b3Quaternion& orn = bodies[b].m_quat;
switch (collidables[bodies[b].m_collidableIdx].m_shapeType)
{
case SHAPE_SPHERE:
case SHAPE_SPHERE:
{
b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius;
if (sphere_intersect(pos, radius, rayFrom, rayTo,hitFraction))
if (sphere_intersect(pos, radius, rayFrom, rayTo, hitFraction))
{
hitBodyIndex = b;
b3Vector3 hitPoint;
hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction);
hitNormal = (hitPoint-bodies[b].m_pos).normalize();
hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
hitNormal = (hitPoint - bodies[b].m_pos).normalize();
}
}
case SHAPE_CONVEX_HULL:
case SHAPE_CONVEX_HULL:
{
b3Transform convexWorldTransform;
convexWorldTransform.setIdentity();
convexWorldTransform.setOrigin(bodies[b].m_pos);
@@ -222,72 +215,67 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3A
b3Vector3 rayFromLocal = convexWorld2Local(rayFrom);
b3Vector3 rayToLocal = convexWorld2Local(rayTo);
int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex;
const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex];
if (rayConvex(rayFromLocal, rayToLocal,poly,narrowphaseData->m_convexFaces, hitFraction, hitNormal))
if (rayConvex(rayFromLocal, rayToLocal, poly, narrowphaseData->m_convexFaces, hitFraction, hitNormal))
{
hitBodyIndex = b;
}
break;
}
default:
default:
{
static bool once=true;
static bool once = true;
if (once)
{
once=false;
once = false;
b3Warning("Raytest: unsupported shape type\n");
}
}
}
}
if (hitBodyIndex>=0)
if (hitBodyIndex >= 0)
{
hitResults[r].m_hitFraction = hitFraction;
hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction);
hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
hitResults[r].m_hitNormal = hitNormal;
hitResults[r].m_hitBody = hitBodyIndex;
}
}
}
///todo: add some acceleration structure (AABBs, tree etc)
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
{
//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
B3_PROFILE("castRaysGPU");
{
B3_PROFILE("raycast copyFromHost");
m_data->m_gpuRays->copyFromHost(rays);
m_data->m_gpuHitResults->copyFromHost(hitResults);
}
int numRays = hitResults.size();
{
m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
m_data->m_numRayRigidPairsPerRay->resize(numRays);
m_data->m_gpuNumRayRigidPairs->resize(1);
m_data->m_gpuRayRigidPairs->resize(numRays * 16);
}
//run kernel
const bool USE_BRUTE_FORCE_RAYCAST = false;
if(USE_BRUTE_FORCE_RAYCAST)
if (USE_BRUTE_FORCE_RAYCAST)
{
B3_PROFILE("raycast launch1D");
b3LauncherCL launcher(m_data->m_q,m_data->m_raytraceKernel,"m_raytraceKernel");
b3LauncherCL launcher(m_data->m_q, m_data->m_raytraceKernel, "m_raytraceKernel");
int numRays = rays.size();
launcher.setConst(numRays);
@@ -299,93 +287,88 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align
launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL());
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
else
{
m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() );
m_data->m_plbvh->build(broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU());
m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
int numRayRigidPairs = -1;
m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() )
if (numRayRigidPairs > m_data->m_gpuRayRigidPairs->size())
{
numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
}
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
//Sort ray-rigid pairs by ray index
{
B3_PROFILE("sort ray-rigid pairs");
m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) );
m_data->m_radixSorter->execute(*reinterpret_cast<b3OpenCLArray<b3SortData>*>(m_data->m_gpuRayRigidPairs));
}
//detect start,count of each ray pair
{
B3_PROFILE("detect ray-rigid pair index ranges");
{
B3_PROFILE("reset ray-rigid pair index ranges");
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
clFinish(m_data->m_q);
}
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ),
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRayRigidPairs);
launcher.launch1D(numRayRigidPairs);
clFinish(m_data->m_q);
}
{
B3_PROFILE("ray-rigid intersection");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ),
b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ),
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ),
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRays->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuHitResults->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_bodyBufferGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_collidablesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexFacesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
}
//copy results
{
B3_PROFILE("raycast copyToHost");
m_data->m_gpuHitResults->copyToHost(hitResults);
}
}

View File

@@ -7,26 +7,22 @@
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
class b3GpuRaycast
{
protected:
struct b3GpuRaycastInternalData* m_data;
public:
b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q);
b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuRaycast();
void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
};
#endif //B3_GPU_RAYCAST_H
#endif //B3_GPU_RAYCAST_H

View File

@@ -1,381 +1,380 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL ray-cast program, compiled at runtime by b3GpuRaycast
// (see compileCLProgramFromString usage elsewhere in this commit). Contains:
//  - rayCastKernel: brute-force ray vs. every body (sphere + convex hull).
//  - findRayRigidPairIndexRanges: per-ray start index (atomic_min) and pair
//    count (atomic_inc) into the sorted ray-rigid pair array.
//  - rayCastPairsKernel: ray test restricted to broadphase-produced pairs.
// Do not hand-edit the literal; regenerate from rayCastKernels.cl instead.
// NOTE(review): this diff view contains a second definition of
// rayCastKernelCL further down (post-clang-format copy) — in the real file
// only one definition exists; verify against the repository.
static const char* rayCastKernelCL= \
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_PLANE 4\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef struct\n"
"{\n"
"	float4 m_from;\n"
"	float4 m_to;\n"
"} b3RayInfo;\n"
"typedef struct\n"
"{\n"
"	float m_hitFraction;\n"
"	int m_hitResult0;\n"
"	int m_hitResult1;\n"
"	int m_hitResult2;\n"
"	float4 m_hitPoint;\n"
"	float4 m_hitNormal;\n"
"} b3RayHit;\n"
"typedef struct\n"
"{\n"
"	float4 m_pos;\n"
"	float4 m_quat;\n"
"	float4 m_linVel;\n"
"	float4 m_angVel;\n"
"	unsigned int m_collidableIdx;\n"
"	float m_invMass;\n"
"	float m_restituitionCoeff;\n"
"	float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct Collidable\n"
"{\n"
"	union {\n"
"		int m_numChildShapes;\n"
"		int m_bvhIndex;\n"
"	};\n"
"	float m_radius;\n"
"	int m_shapeType;\n"
"	int m_shapeIndex;\n"
"} Collidable;\n"
"typedef struct \n"
"{\n"
"	float4 m_localCenter;\n"
"	float4 m_extents;\n"
"	float4 mC;\n"
"	float4 mE;\n"
"	float m_radius;\n"
"	int m_faceOffset;\n"
"	int m_numFaces;\n"
"	int m_numVertices;\n"
"	int m_vertexOffset;\n"
"	int m_uniqueEdgesOffset;\n"
"	int m_numUniqueEdges;\n"
"	int m_unused;\n"
"} ConvexPolyhedronCL;\n"
"typedef struct\n"
"{\n"
"	float4 m_plane;\n"
"	int m_indexOffset;\n"
"	int m_numIndices;\n"
"} b3GpuFace;\n"
"///////////////////////////////////////\n"
"//	Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"__inline\n"
"	Quaternion qtMul(Quaternion a, Quaternion b);\n"
"__inline\n"
"	Quaternion qtNormalize(Quaternion in);\n"
"__inline\n"
"	Quaternion qtInvert(Quaternion q);\n"
"__inline\n"
"	float dot3F4(float4 a, float4 b)\n"
"{\n"
"	float4 a1 = (float4)(a.xyz,0.f);\n"
"	float4 b1 = (float4)(b.xyz,0.f);\n"
"	return dot(a1, b1);\n"
"}\n"
"__inline\n"
"	Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
"	Quaternion ans;\n"
"	ans = cross( a, b );\n"
"	ans += a.w*b+b.w*a;\n"
"	//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
"	ans.w = a.w*b.w - dot3F4(a, b);\n"
"	return ans;\n"
"}\n"
"__inline\n"
"	Quaternion qtNormalize(Quaternion in)\n"
"{\n"
"	return fast_normalize(in);\n"
"	//	in /= length( in );\n"
"	//	return in;\n"
"}\n"
"__inline\n"
"	float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
"	Quaternion qInv = qtInvert( q );\n"
"	float4 vcpy = vec;\n"
"	vcpy.w = 0.f;\n"
"	float4 out = qtMul(q,vcpy);\n"
"	out = qtMul(out,qInv);\n"
"	return out;\n"
"}\n"
"__inline\n"
"	Quaternion qtInvert(Quaternion q)\n"
"{\n"
"	return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"__inline\n"
"	float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
"	return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"void trInverse(float4 translationIn, Quaternion orientationIn,\n"
"	float4* translationOut, Quaternion* orientationOut)\n"
"{\n"
"	*orientationOut = qtInvert(orientationIn);\n"
"	*translationOut = qtRotate(*orientationOut, -translationIn);\n"
"}\n"
"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
"	__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
"{\n"
"	rayFromLocal.w = 0.f;\n"
"	rayToLocal.w = 0.f;\n"
"	bool result = true;\n"
"	float exitFraction = hitFraction[0];\n"
"	float enterFraction = -0.3f;\n"
"	float4 curHitNormal = (float4)(0,0,0,0);\n"
"	for (int i=0;i<numFaces && result;i++)\n"
"	{\n"
"		b3GpuFace face = faces[faceOffset+i];\n"
"		float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
"		float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
"		if (fromPlaneDist<0.f)\n"
"		{\n"
"			if (toPlaneDist >= 0.f)\n"
"			{\n"
"				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
"				if (exitFraction>fraction)\n"
"				{\n"
"					exitFraction = fraction;\n"
"				}\n"
"			} \n"
"		} else\n"
"		{\n"
"			if (toPlaneDist<0.f)\n"
"			{\n"
"				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
"				if (enterFraction <= fraction)\n"
"				{\n"
"					enterFraction = fraction;\n"
"					curHitNormal = face.m_plane;\n"
"					curHitNormal.w = 0.f;\n"
"				}\n"
"			} else\n"
"			{\n"
"				result = false;\n"
"			}\n"
"		}\n"
"		if (exitFraction <= enterFraction)\n"
"			result = false;\n"
"	}\n"
"	if (enterFraction < 0.f)\n"
"	{\n"
"		result = false;\n"
"	}\n"
"	if (result)\n"
"	{	\n"
"		hitFraction[0] = enterFraction;\n"
"		hitNormal[0] = curHitNormal;\n"
"	}\n"
"	return result;\n"
"}\n"
"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
"{\n"
"	float4 rs = rayFrom - spherePos;\n"
"	rs.w = 0.f;\n"
"	float4 rayDir = rayTo-rayFrom;\n"
"	rayDir.w = 0.f;\n"
"	float A = dot(rayDir,rayDir);\n"
"	float B = dot(rs, rayDir);\n"
"	float C = dot(rs, rs) - (radius * radius);\n"
"	float D = B * B - A*C;\n"
"	if (D > 0.0f)\n"
"	{\n"
"		float t = (-B - sqrt(D))/A;\n"
"		if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
"		{\n"
"			*hitFraction = t;\n"
"			return true;\n"
"		}\n"
"	}\n"
"	return false;\n"
"}\n"
"float4 setInterpolate3(float4 from, float4 to, float t)\n"
"{\n"
"	float s = 1.0f - t;\n"
"	float4 result;\n"
"	result = s * from + t * to;\n"
"	result.w = 0.f;	\n"
"	return result;	\n"
"}\n"
"__kernel void rayCastKernel( \n"
"	int numRays, \n"
"	const __global b3RayInfo* rays, \n"
"	__global b3RayHit* hitResults, \n"
"	const int numBodies, \n"
"	__global Body* bodies,\n"
"	__global Collidable* collidables,\n"
"	__global const b3GpuFace* faces,\n"
"	__global const ConvexPolyhedronCL* convexShapes )\n"
"{\n"
"	int i = get_global_id(0);\n"
"	if (i>=numRays)\n"
"		return;\n"
"	hitResults[i].m_hitFraction = 1.f;\n"
"	float4 rayFrom = rays[i].m_from;\n"
"	float4 rayTo = rays[i].m_to;\n"
"	float hitFraction = 1.f;\n"
"	float4 hitPoint;\n"
"	float4 hitNormal;\n"
"	int hitBodyIndex= -1;\n"
"	int cachedCollidableIndex = -1;\n"
"	Collidable cachedCollidable;\n"
"	for (int b=0;b<numBodies;b++)\n"
"	{\n"
"		if (hitResults[i].m_hitResult2==b)\n"
"			continue;\n"
"		Body body = bodies[b];\n"
"		float4 pos = body.m_pos;\n"
"		float4 orn = body.m_quat;\n"
"		if (cachedCollidableIndex != body.m_collidableIdx)\n"
"		{\n"
"			cachedCollidableIndex = body.m_collidableIdx;\n"
"			cachedCollidable = collidables[cachedCollidableIndex];\n"
"		}\n"
"		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
"		{\n"
"			float4 invPos = (float4)(0,0,0,0);\n"
"			float4 invOrn = (float4)(0,0,0,0);\n"
"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
"			float4 rayToLocal = (float4)(0,0,0,0);\n"
"			invOrn = qtInvert(orn);\n"
"			invPos = qtRotate(invOrn, -pos);\n"
"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
"			rayFromLocal.w = 0.f;\n"
"			rayToLocal.w = 0.f;\n"
"			int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
"			int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
"			if (numFaces)\n"
"			{\n"
"				if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
"				{\n"
"					hitBodyIndex = b;\n"
"					\n"
"				}\n"
"			}\n"
"		}\n"
"		if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
"		{\n"
"			float radius = cachedCollidable.m_radius;\n"
"		\n"
"			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
"			{\n"
"				hitBodyIndex = b;\n"
"				hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
"			}\n"
"		}\n"
"	}\n"
"	if (hitBodyIndex>=0)\n"
"	{\n"
"		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
"		hitResults[i].m_hitFraction = hitFraction;\n"
"		hitResults[i].m_hitPoint = hitPoint;\n"
"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
"	}\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
"	__global int* out_firstRayRigidPairIndexPerRay,\n"
"	__global int* out_numRayRigidPairsPerRay,\n"
"	int numRayRigidPairs)\n"
"{\n"
"	int rayRigidPairIndex = get_global_id(0);\n"
"	if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
"	\n"
"	int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
"	\n"
"	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
"	atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
"	__global b3RayHit* hitResults, \n"
"	__global int* firstRayRigidPairIndexPerRay,\n"
"	__global int* numRayRigidPairsPerRay,\n"
"	\n"
"	__global Body* bodies,\n"
"	__global Collidable* collidables,\n"
"	__global const b3GpuFace* faces,\n"
"	__global const ConvexPolyhedronCL* convexShapes,\n"
"	\n"
"	__global int2* rayRigidPairs,\n"
"	int numRays)\n"
"{\n"
"	int i = get_global_id(0);\n"
"	if (i >= numRays) return;\n"
"	\n"
"	float4 rayFrom = rays[i].m_from;\n"
"	float4 rayTo = rays[i].m_to;\n"
"	\n"
"	hitResults[i].m_hitFraction = 1.f;\n"
"	\n"
"	float hitFraction = 1.f;\n"
"	float4 hitPoint;\n"
"	float4 hitNormal;\n"
"	int hitBodyIndex = -1;\n"
"	\n"
"	//\n"
"	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
"	{\n"
"		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
"		int b = rayRigidPairs[rayRigidPairIndex].y;\n"
"		\n"
"		if (hitResults[i].m_hitResult2 == b) continue;\n"
"		\n"
"		Body body = bodies[b];\n"
"		Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
"		\n"
"		float4 pos = body.m_pos;\n"
"		float4 orn = body.m_quat;\n"
"		\n"
"		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
"		{\n"
"			float4 invPos = (float4)(0,0,0,0);\n"
"			float4 invOrn = (float4)(0,0,0,0);\n"
"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
"			float4 rayToLocal = (float4)(0,0,0,0);\n"
"			invOrn = qtInvert(orn);\n"
"			invPos = qtRotate(invOrn, -pos);\n"
"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
"			rayFromLocal.w = 0.f;\n"
"			rayToLocal.w = 0.f;\n"
"			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
"			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
"			\n"
"			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
"			{\n"
"				hitBodyIndex = b;\n"
"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
"			}\n"
"		}\n"
"		\n"
"		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
"		{\n"
"			float radius = rigidCollidable.m_radius;\n"
"		\n"
"			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
"			{\n"
"				hitBodyIndex = b;\n"
"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
"				hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
"			}\n"
"		}\n"
"	}\n"
"	\n"
"	if (hitBodyIndex >= 0)\n"
"	{\n"
"		hitResults[i].m_hitFraction = hitFraction;\n"
"		hitResults[i].m_hitPoint = hitPoint;\n"
"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
"	}\n"
"	\n"
"}\n"
;
static const char* rayCastKernelCL =
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_PLANE 4\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef struct\n"
"{\n"
" float4 m_from;\n"
" float4 m_to;\n"
"} b3RayInfo;\n"
"typedef struct\n"
"{\n"
" float m_hitFraction;\n"
" int m_hitResult0;\n"
" int m_hitResult1;\n"
" int m_hitResult2;\n"
" float4 m_hitPoint;\n"
" float4 m_hitNormal;\n"
"} b3RayHit;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" unsigned int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct Collidable\n"
"{\n"
" union {\n"
" int m_numChildShapes;\n"
" int m_bvhIndex;\n"
" };\n"
" float m_radius;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
"} Collidable;\n"
"typedef struct \n"
"{\n"
" float4 m_localCenter;\n"
" float4 m_extents;\n"
" float4 mC;\n"
" float4 mE;\n"
" float m_radius;\n"
" int m_faceOffset;\n"
" int m_numFaces;\n"
" int m_numVertices;\n"
" int m_vertexOffset;\n"
" int m_uniqueEdgesOffset;\n"
" int m_numUniqueEdges;\n"
" int m_unused;\n"
"} ConvexPolyhedronCL;\n"
"typedef struct\n"
"{\n"
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b);\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in);\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q);\n"
"__inline\n"
" float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = (float4)(a.xyz,0.f);\n"
" float4 b1 = (float4)(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross( a, b );\n"
" ans += a.w*b+b.w*a;\n"
" // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fast_normalize(in);\n"
" // in /= length( in );\n"
" // return in;\n"
"}\n"
"__inline\n"
" float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(q,vcpy);\n"
" out = qtMul(out,qInv);\n"
" return out;\n"
"}\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"__inline\n"
" float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"void trInverse(float4 translationIn, Quaternion orientationIn,\n"
" float4* translationOut, Quaternion* orientationOut)\n"
"{\n"
" *orientationOut = qtInvert(orientationIn);\n"
" *translationOut = qtRotate(*orientationOut, -translationIn);\n"
"}\n"
"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
" __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
"{\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" bool result = true;\n"
" float exitFraction = hitFraction[0];\n"
" float enterFraction = -0.3f;\n"
" float4 curHitNormal = (float4)(0,0,0,0);\n"
" for (int i=0;i<numFaces && result;i++)\n"
" {\n"
" b3GpuFace face = faces[faceOffset+i];\n"
" float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
" float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
" if (fromPlaneDist<0.f)\n"
" {\n"
" if (toPlaneDist >= 0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (exitFraction>fraction)\n"
" {\n"
" exitFraction = fraction;\n"
" }\n"
" } \n"
" } else\n"
" {\n"
" if (toPlaneDist<0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (enterFraction <= fraction)\n"
" {\n"
" enterFraction = fraction;\n"
" curHitNormal = face.m_plane;\n"
" curHitNormal.w = 0.f;\n"
" }\n"
" } else\n"
" {\n"
" result = false;\n"
" }\n"
" }\n"
" if (exitFraction <= enterFraction)\n"
" result = false;\n"
" }\n"
" if (enterFraction < 0.f)\n"
" {\n"
" result = false;\n"
" }\n"
" if (result)\n"
" { \n"
" hitFraction[0] = enterFraction;\n"
" hitNormal[0] = curHitNormal;\n"
" }\n"
" return result;\n"
"}\n"
"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
"{\n"
" float4 rs = rayFrom - spherePos;\n"
" rs.w = 0.f;\n"
" float4 rayDir = rayTo-rayFrom;\n"
" rayDir.w = 0.f;\n"
" float A = dot(rayDir,rayDir);\n"
" float B = dot(rs, rayDir);\n"
" float C = dot(rs, rs) - (radius * radius);\n"
" float D = B * B - A*C;\n"
" if (D > 0.0f)\n"
" {\n"
" float t = (-B - sqrt(D))/A;\n"
" if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
" {\n"
" *hitFraction = t;\n"
" return true;\n"
" }\n"
" }\n"
" return false;\n"
"}\n"
"float4 setInterpolate3(float4 from, float4 to, float t)\n"
"{\n"
" float s = 1.0f - t;\n"
" float4 result;\n"
" result = s * from + t * to;\n"
" result.w = 0.f; \n"
" return result; \n"
"}\n"
"__kernel void rayCastKernel( \n"
" int numRays, \n"
" const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" const int numBodies, \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes )\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numRays)\n"
" return;\n"
" hitResults[i].m_hitFraction = 1.f;\n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex= -1;\n"
" int cachedCollidableIndex = -1;\n"
" Collidable cachedCollidable;\n"
" for (int b=0;b<numBodies;b++)\n"
" {\n"
" if (hitResults[i].m_hitResult2==b)\n"
" continue;\n"
" Body body = bodies[b];\n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" if (cachedCollidableIndex != body.m_collidableIdx)\n"
" {\n"
" cachedCollidableIndex = body.m_collidableIdx;\n"
" cachedCollidable = collidables[cachedCollidableIndex];\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
" if (numFaces)\n"
" {\n"
" if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" \n"
" }\n"
" }\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = cachedCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" if (hitBodyIndex>=0)\n"
" {\n"
" hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
" __global int* out_firstRayRigidPairIndexPerRay,\n"
" __global int* out_numRayRigidPairsPerRay,\n"
" int numRayRigidPairs)\n"
"{\n"
" int rayRigidPairIndex = get_global_id(0);\n"
" if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
" \n"
" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
" \n"
" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" __global int* firstRayRigidPairIndexPerRay,\n"
" __global int* numRayRigidPairsPerRay,\n"
" \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" \n"
" __global int2* rayRigidPairs,\n"
" int numRays)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i >= numRays) return;\n"
" \n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" \n"
" hitResults[i].m_hitFraction = 1.f;\n"
" \n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex = -1;\n"
" \n"
" //\n"
" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
" {\n"
" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
" int b = rayRigidPairs[rayRigidPairIndex].y;\n"
" \n"
" if (hitResults[i].m_hitResult2 == b) continue;\n"
" \n"
" Body body = bodies[b];\n"
" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
" \n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
" \n"
" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" }\n"
" }\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = rigidCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" \n"
" if (hitBodyIndex >= 0)\n"
" {\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
" \n"
"}\n";

View File

@@ -5,14 +5,13 @@
#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4 : public b3ContactConstraint4
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuConstraint4 : public b3ContactConstraint4
{
B3_DECLARE_ALIGNED_ALLOCATOR();
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
};
#endif //B3_CONSTRAINT4_h
#endif //B3_CONSTRAINT4_h

View File

@@ -19,11 +19,11 @@ subject to the following restrictions:
#include <new>
#include "Bullet3Common/b3Transform.h"
void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData* bodies)
void b3GpuGenericConstraint::getInfo1(unsigned int* info, const b3RigidBodyData* bodies)
{
switch (m_constraintType)
{
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
{
*info = 3;
break;
@@ -35,7 +35,7 @@ void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData*
};
}
void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
b3Transform trA;
trA.setIdentity();
@@ -47,54 +47,52 @@ void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo
trB.setOrigin(bodies[constraint->m_rbB].m_pos);
trB.setRotation(bodies[constraint->m_rbB].m_quat);
// anchor points in global coordinates with respect to body PORs.
// set jacobian
info->m_J1linearAxis[0] = 1;
info->m_J1linearAxis[info->rowskip+1] = 1;
info->m_J1linearAxis[2*info->rowskip+2] = 1;
// anchor points in global coordinates with respect to body PORs.
b3Vector3 a1 = trA.getBasis()*constraint->getPivotInA();
// set jacobian
info->m_J1linearAxis[0] = 1;
info->m_J1linearAxis[info->rowskip + 1] = 1;
info->m_J1linearAxis[2 * info->rowskip + 2] = 1;
b3Vector3 a1 = trA.getBasis() * constraint->getPivotInA();
//b3Vector3 a1a = b3QuatRotate(trA.getRotation(),constraint->getPivotInA());
{
b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip);
b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis + 2 * info->rowskip);
b3Vector3 a1neg = -a1;
a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2);
a1neg.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
if (info->m_J2linearAxis)
{
info->m_J2linearAxis[0] = -1;
info->m_J2linearAxis[info->rowskip+1] = -1;
info->m_J2linearAxis[2*info->rowskip+2] = -1;
info->m_J2linearAxis[info->rowskip + 1] = -1;
info->m_J2linearAxis[2 * info->rowskip + 2] = -1;
}
b3Vector3 a2 = trB.getBasis()*constraint->getPivotInB();
b3Vector3 a2 = trB.getBasis() * constraint->getPivotInB();
{
// b3Vector3 a2n = -a2;
// b3Vector3 a2n = -a2;
b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip);
a2.getSkewSymmetricMatrix(angular0,angular1,angular2);
b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis + 2 * info->rowskip);
a2.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
// set right hand side
// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
// set right hand side
// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
b3Scalar currERP = info->erp;
b3Scalar k = info->fps * currERP;
int j;
for (j=0; j<3; j++)
{
info->m_constraintError[j*info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
int j;
for (j = 0; j < 3; j++)
{
info->m_constraintError[j * info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
//printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]);
}
}
#if 0
if(m_flags & B3_P2P_FLAGS_CFM)
{
@@ -117,21 +115,20 @@ void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo
}
info->m_damping = m_setting.m_damping;
#endif
}
void b3GpuGenericConstraint::getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
void b3GpuGenericConstraint::getInfo2(b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
switch (m_constraintType)
{
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
{
getInfo2Point2Point(this,info,bodies);
getInfo2Point2Point(this, info, bodies);
break;
};
default:
{
b3Assert(0);
}
{
b3Assert(0);
}
};
}

View File

@@ -20,37 +20,35 @@ subject to the following restrictions:
struct b3RigidBodyData;
enum B3_CONSTRAINT_FLAGS
{
B3_CONSTRAINT_FLAG_ENABLED=1,
B3_CONSTRAINT_FLAG_ENABLED = 1,
};
enum b3GpuGenericConstraintType
{
B3_GPU_POINT2POINT_CONSTRAINT_TYPE=3,
B3_GPU_FIXED_CONSTRAINT_TYPE=4,
// B3_HINGE_CONSTRAINT_TYPE,
// B3_CONETWIST_CONSTRAINT_TYPE,
// B3_D6_CONSTRAINT_TYPE,
// B3_SLIDER_CONSTRAINT_TYPE,
// B3_CONTACT_CONSTRAINT_TYPE,
// B3_D6_SPRING_CONSTRAINT_TYPE,
// B3_GEAR_CONSTRAINT_TYPE,
B3_GPU_POINT2POINT_CONSTRAINT_TYPE = 3,
B3_GPU_FIXED_CONSTRAINT_TYPE = 4,
// B3_HINGE_CONSTRAINT_TYPE,
// B3_CONETWIST_CONSTRAINT_TYPE,
// B3_D6_CONSTRAINT_TYPE,
// B3_SLIDER_CONSTRAINT_TYPE,
// B3_CONTACT_CONSTRAINT_TYPE,
// B3_D6_SPRING_CONSTRAINT_TYPE,
// B3_GEAR_CONSTRAINT_TYPE,
B3_GPU_MAX_CONSTRAINT_TYPE
};
struct b3GpuConstraintInfo2
struct b3GpuConstraintInfo2
{
// integrator parameters: frames per second (1/stepsize), default error
// reduction parameter (0..1).
b3Scalar fps,erp;
b3Scalar fps, erp;
// for the first and second body, pointers to two (linear and angular)
// n*3 jacobian sub matrices, stored by rows. these matrices will have
// been initialized to 0 on entry. if the second body is zero then the
// J2xx pointers may be 0.
b3Scalar *m_J1linearAxis,*m_J1angularAxis,*m_J2linearAxis,*m_J2angularAxis;
b3Scalar *m_J1linearAxis, *m_J1angularAxis, *m_J2linearAxis, *m_J2angularAxis;
// elements to jump from one row to the next in J's
int rowskip;
@@ -58,44 +56,44 @@ struct b3GpuConstraintInfo2
// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
// "constraint force mixing" vector. c is set to zero on entry, cfm is
// set to a constant value (typically very small or zero) value on entry.
b3Scalar *m_constraintError,*cfm;
b3Scalar *m_constraintError, *cfm;
// lo and hi limits for variables (set to -/+ infinity on entry).
b3Scalar *m_lowerLimit,*m_upperLimit;
b3Scalar *m_lowerLimit, *m_upperLimit;
// findex vector for variables. see the LCP solver interface for a
// description of what this does. this is set to -1 on entry.
// note that the returned indexes are relative to the first index of
// the constraint.
int *findex;
int* findex;
// number of solver iterations
int m_numIterations;
//damping of the velocity
b3Scalar m_damping;
b3Scalar m_damping;
};
B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuGenericConstraint
{
int m_constraintType;
int m_rbA;
int m_rbB;
float m_breakingImpulseThreshold;
int m_constraintType;
int m_rbA;
int m_rbB;
float m_breakingImpulseThreshold;
b3Vector3 m_pivotInA;
b3Vector3 m_pivotInB;
b3Quaternion m_relTargetAB;
int m_flags;
int m_flags;
int m_uid;
int m_padding[2];
int getRigidBodyA() const
int getRigidBodyA() const
{
return m_rbA;
}
int getRigidBodyB() const
int getRigidBodyB() const
{
return m_rbB;
}
@@ -121,12 +119,10 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint
}
///internal method used by the constraint solver, don't use them directly
void getInfo1 (unsigned int* info,const b3RigidBodyData* bodies);
void getInfo1(unsigned int* info, const b3RigidBodyData* bodies);
///internal method used by the constraint solver, don't use them directly
void getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies);
void getInfo2(b3GpuConstraintInfo2 * info, const b3RigidBodyData* bodies);
};
#endif //B3_GPU_GENERIC_CONSTRAINT_H
#endif //B3_GPU_GENERIC_CONSTRAINT_H

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,6 @@
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
//struct b3InertiaData;
//b3InertiaData
@@ -21,21 +20,20 @@ struct b3JacobiSolverInfo
float m_deltaTime;
float m_positionDrift;
float m_positionConstraintCoeff;
int m_numIterations;
int m_numIterations;
b3JacobiSolverInfo()
:m_fixedBodyIndex(0),
m_deltaTime(1./60.f),
m_positionDrift( 0.005f ),
m_positionConstraintCoeff( 0.99f ),
m_numIterations(7)
: m_fixedBodyIndex(0),
m_deltaTime(1. / 60.f),
m_positionDrift(0.005f),
m_positionConstraintCoeff(0.99f),
m_numIterations(7)
{
}
};
class b3GpuJacobiContactSolver
{
protected:
struct b3GpuJacobiSolverInternalData* m_data;
cl_context m_context;
@@ -43,20 +41,16 @@ protected:
cl_command_queue m_queue;
public:
b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3GpuJacobiContactSolver();
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
void solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,struct b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo);
void solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, struct b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo);
//void solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo);
//b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
//void solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
//void solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
};
#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H
#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H

File diff suppressed because it is too large Load Diff

View File

@@ -9,11 +9,10 @@
class b3GpuNarrowPhase
{
protected:
struct b3GpuNarrowPhaseInternalData* m_data;
struct b3GpuNarrowPhaseInternalData* m_data;
int m_acceleratedCompanionShapeIndex;
int m_planeBodyIndex;
int m_static0Index;
int m_static0Index;
cl_context m_context;
cl_device_id m_device;
@@ -23,64 +22,58 @@ protected:
int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
public:
b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config);
virtual ~b3GpuNarrowPhase(void);
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerFace(const b3Vector3& faceNormal, float faceConstant);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//do they need to be merged?
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
void writeAllBodiesToGpu();
void reset();
void readbackAllBodiesToCpu();
bool getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const;
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax, bool writeToGpu);
void setObjectTransform(const float* position, const float* orientation, int bodyIndex);
void setObjectTransformCpu(float* position, float* orientation , int bodyIndex);
void writeAllBodiesToGpu();
void reset();
void readbackAllBodiesToCpu();
bool getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const;
void setObjectTransformCpu(float* position, float* orientation, int bodyIndex);
void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex);
virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
cl_mem getBodiesGpu();
cl_mem getBodiesGpu();
const struct b3RigidBodyData* getBodiesCpu() const;
//struct b3RigidBodyData* getBodiesCpu();
int getNumBodiesGpu() const;
int getNumBodiesGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getCollidablesGpu();
cl_mem getCollidablesGpu();
const struct b3Collidable* getCollidablesCpu() const;
int getNumCollidablesGpu() const;
int getNumCollidablesGpu() const;
const struct b3SapAabb* getLocalSpaceAabbsCpu() const;
const struct b3Contact4* getContactsCPU() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
cl_mem getAabbLocalSpaceBufferGpu();
cl_mem getAabbLocalSpaceBufferGpu();
int getNumRigidBodies() const;
int allocateCollidable();
@@ -92,18 +85,17 @@ public:
b3Collidable& getCollidableCpu(int collidableIndex);
const b3Collidable& getCollidableCpu(int collidableIndex) const;
const b3GpuNarrowPhaseInternalData* getInternalData() const
const b3GpuNarrowPhaseInternalData* getInternalData() const
{
return m_data;
return m_data;
}
b3GpuNarrowPhaseInternalData* getInternalData()
b3GpuNarrowPhaseInternalData* getInternalData()
{
return m_data;
return m_data;
}
const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const;
};
#endif //B3_GPU_NARROWPHASE_H
#endif //B3_GPU_NARROWPHASE_H

View File

@@ -20,57 +20,53 @@
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Common/shared/b3Int2.h"
class b3ConvexUtility;
struct b3GpuNarrowPhaseInternalData
{
b3AlignedObjectArray<b3ConvexUtility*>* m_convexData;
b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra;
b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
b3AlignedObjectArray<b3Vector3> m_convexVertices;
b3AlignedObjectArray<int> m_convexIndices;
b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU;
b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU;
b3OpenCLArray<b3Vector3>* m_convexVerticesGPU;
b3OpenCLArray<int>* m_convexIndicesGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
b3AlignedObjectArray<b3GpuFace> m_convexFaces;
b3OpenCLArray<b3GpuFace>* m_convexFacesGPU;
struct GpuSatCollision* m_gpuSatCollision;
b3OpenCLArray<b3Int4>* m_triangleConvexPairs;
struct GpuSatCollision* m_gpuSatCollision;
b3OpenCLArray<b3Int4>* m_triangleConvexPairs;
b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2];
int m_currentContactBuffer;
int m_currentContactBuffer;
b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU;
b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU;
b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
int m_numAcceleratedShapes;
int m_numAcceleratedRigidBodies;
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU;
b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU;
@@ -78,18 +74,16 @@ struct b3GpuNarrowPhaseInternalData
b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData;
b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3Config m_config;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
b3Config m_config;
};
#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H
#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H

File diff suppressed because it is too large Load Diff

View File

@@ -19,7 +19,6 @@ subject to the following restrictions:
struct b3Contact4;
struct b3ContactPoint;
class b3Dispatcher;
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
@@ -38,41 +37,40 @@ class b3GpuPgsConstraintSolver
protected:
int m_staticIdx;
struct b3GpuPgsJacobiSolverInternalData* m_gpuData;
protected:
b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool;
b3GpuConstraintArray m_tmpSolverContactConstraintPool;
b3GpuConstraintArray m_tmpSolverNonContactConstraintPool;
b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool;
b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;
protected:
b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool;
b3GpuConstraintArray m_tmpSolverContactConstraintPool;
b3GpuConstraintArray m_tmpSolverNonContactConstraintPool;
b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool;
b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;
b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool;
bool m_usePgs;
void averageVelocities();
bool m_usePgs;
void averageVelocities();
int m_maxOverrideNumSolverIterations;
int m_maxOverrideNumSolverIterations;
int m_numSplitImpulseRecoveries;
int m_numSplitImpulseRecoveries;
// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
public:
b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs);
virtual~b3GpuPgsConstraintSolver ();
b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs);
virtual ~b3GpuPgsConstraintSolver();
virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints);
b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints);
int sortConstraintByBatch3( struct b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies);
void recomputeBatches();
int sortConstraintByBatch3(struct b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies);
void recomputeBatches();
};
#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H
#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H

File diff suppressed because it is too large Load Diff

View File

@@ -11,33 +11,27 @@
class b3GpuPgsContactSolver
{
protected:
int m_debugOutput;
struct b3GpuBatchingPgsSolverInternalData* m_data;
struct b3GpuBatchingPgsSolverInternalData* m_data;
void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
inline int sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch2( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch3( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies, int* batchSizes);
void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx);
void solveContactConstraintBatchSizes( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes);
inline int sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch2(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch3(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies, int* batchSizes);
void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes);
void solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
public:
b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity);
b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity);
virtual ~b3GpuPgsContactSolver();
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
};
#endif //B3_GPU_BATCHING_PGS_SOLVER_H
#endif //B3_GPU_BATCHING_PGS_SOLVER_H

View File

@@ -47,7 +47,7 @@ bool gClearPairsOnGpu = true;
#define TEST_OTHER_GPU_SOLVER 1
#ifdef TEST_OTHER_GPU_SOLVER
#include "b3GpuJacobiContactSolver.h"
#endif //TEST_OTHER_GPU_SOLVER
#endif //TEST_OTHER_GPU_SOLVER
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
@@ -59,73 +59,68 @@ bool gClearPairsOnGpu = true;
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3OpenCL/Raycast/b3GpuRaycast.h"
#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q,class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap , struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
{
m_data = new b3GpuRigidBodyPipelineInternalData;
m_data->m_constraintUid=0;
m_data->m_constraintUid = 0;
m_data->m_config = config;
m_data->m_context = ctx;
m_data->m_device = device;
m_data->m_queue = q;
m_data->m_solver = new b3PgsJacobiSolver(true);//new b3PgsJacobiSolver(true);
m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx,device,q,true);//new b3PgsJacobiSolver(true);
m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx,q,config.m_maxConvexBodies);
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx,q,config.m_maxBroadphasePairs);
m_data->m_solver = new b3PgsJacobiSolver(true); //new b3PgsJacobiSolver(true);
m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx, device, q, true); //new b3PgsJacobiSolver(true);
m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx,q);
m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx, q, config.m_maxConvexBodies);
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx, q, config.m_maxBroadphasePairs);
m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx, q);
#ifdef TEST_OTHER_GPU_SOLVER
m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx,device,q,config.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER
m_data->m_solver2 = new b3GpuPgsContactSolver(ctx,device,q,config.m_maxBroadphasePairs);
m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER
m_data->m_raycaster = new b3GpuRaycast(ctx,device,q);
m_data->m_solver2 = new b3GpuPgsContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
m_data->m_raycaster = new b3GpuRaycast(ctx, device, q);
m_data->m_broadphaseDbvt = broadphaseDbvt;
m_data->m_broadphaseSap = broadphaseSap;
m_data->m_narrowphase = narrowphase;
m_data->m_gravity.setValue(0.f,-9.8f,0.f);
m_data->m_gravity.setValue(0.f, -9.8f, 0.f);
cl_int errNum=0;
cl_int errNum = 0;
{
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"",B3_RIGIDBODY_INTEGRATE_PATH);
b3Assert(errNum==CL_SUCCESS);
m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, integrateKernelCL, &errNum, "", B3_RIGIDBODY_INTEGRATE_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, integrateKernelCL, "integrateTransformsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
{
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,updateAabbsKernelCL,&errNum,"",B3_RIGIDBODY_UPDATEAABB_PATH);
b3Assert(errNum==CL_SUCCESS);
m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, &errNum, "", B3_RIGIDBODY_UPDATEAABB_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "initializeGpuAabbsFull", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "clearOverlappingPairsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "clearOverlappingPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
}
b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
{
if (m_data->m_integrateTransformsKernel)
clReleaseKernel(m_data->m_integrateTransformsKernel);
if (m_data->m_updateAabbsKernel)
clReleaseKernel(m_data->m_updateAabbsKernel);
if (m_data->m_clearOverlappingPairsKernel)
clReleaseKernel(m_data->m_clearOverlappingPairsKernel);
delete m_data->m_raycaster;
@@ -136,15 +131,14 @@ b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
#ifdef TEST_OTHER_GPU_SOLVER
delete m_data->m_solver3;
#endif //TEST_OTHER_GPU_SOLVER
#endif //TEST_OTHER_GPU_SOLVER
delete m_data->m_solver2;
delete m_data;
}
void b3GpuRigidBodyPipeline::reset()
void b3GpuRigidBodyPipeline::reset()
{
m_data->m_gpuConstraints->resize(0);
m_data->m_cpuConstraints.resize(0);
@@ -152,30 +146,28 @@ void b3GpuRigidBodyPipeline::reset()
m_data->m_allAabbsCPU.resize(0);
}
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.push_back(constraint);
}
void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint)
void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.remove(constraint);
}
void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
{
m_data->m_gpuSolver->recomputeBatches();
//slow linear search
m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
//remove
for (int i=0;i<m_data->m_cpuConstraints.size();i++)
for (int i = 0; i < m_data->m_cpuConstraints.size(); i++)
{
if (m_data->m_cpuConstraints[i].m_uid == uid)
{
//m_data->m_cpuConstraints.remove(m_data->m_cpuConstraints[i]);
m_data->m_cpuConstraints.swap(i,m_data->m_cpuConstraints.size()-1);
m_data->m_cpuConstraints.swap(i, m_data->m_cpuConstraints.size() - 1);
m_data->m_cpuConstraints.pop_back();
break;
@@ -185,13 +177,13 @@ void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
if (m_data->m_cpuConstraints.size())
{
m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
} else
}
else
{
m_data->m_gpuConstraints->resize(0);
}
}
int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold)
int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
@@ -200,14 +192,14 @@ int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, co
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]);
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE;
m_data->m_cpuConstraints.push_back(c);
return c.m_uid;
}
int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB,float breakingThreshold)
int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
@@ -216,9 +208,9 @@ int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const fl
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]);
c.m_relTargetAB.setValue(relTargetAB[0],relTargetAB[1],relTargetAB[2],relTargetAB[3]);
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_relTargetAB.setValue(relTargetAB[0], relTargetAB[1], relTargetAB[2], relTargetAB[3]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE;
@@ -226,31 +218,28 @@ int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const fl
return c.m_uid;
}
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
//update worldspace AABBs from local AABB/worldtransform
{
B3_PROFILE("setupGpuAabbs");
setupGpuAabbsFull();
}
int numPairs =0;
int numPairs = 0;
//compute overlapping pairs
{
if (gUseDbvt)
{
{
B3_PROFILE("setAabb");
m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU);
for (int i=0;i<m_data->m_allAabbsCPU.size();i++)
for (int i = 0; i < m_data->m_allAabbsCPU.size(); i++)
{
b3Vector3 aabbMin=b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0],m_data->m_allAabbsCPU[i].m_min[1],m_data->m_allAabbsCPU[i].m_min[2]);
b3Vector3 aabbMax=b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0],m_data->m_allAabbsCPU[i].m_max[1],m_data->m_allAabbsCPU[i].m_max[2]);
m_data->m_broadphaseDbvt->setAabb(i,aabbMin,aabbMax,0);
b3Vector3 aabbMin = b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0], m_data->m_allAabbsCPU[i].m_min[1], m_data->m_allAabbsCPU[i].m_min[2]);
b3Vector3 aabbMax = b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0], m_data->m_allAabbsCPU[i].m_max[1], m_data->m_allAabbsCPU[i].m_max[2]);
m_data->m_broadphaseDbvt->setAabb(i, aabbMin, aabbMax, 0);
}
}
@@ -259,13 +248,14 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
m_data->m_broadphaseDbvt->calculateOverlappingPairs();
}
numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
} else
}
else
{
if (gUseCalculateOverlappingPairsHost)
{
m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs);
} else
}
else
{
m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
}
@@ -274,24 +264,24 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
//compute contact points
// printf("numPairs=%d\n",numPairs);
int numContacts = 0;
// printf("numPairs=%d\n",numPairs);
int numContacts = 0;
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
if (numPairs)
{
cl_mem pairs =0;
cl_mem aabbsWS =0;
cl_mem pairs = 0;
cl_mem aabbsWS = 0;
if (gUseDbvt)
{
B3_PROFILE("m_overlappingPairsGPU->copyFromHost");
m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
pairs = m_data->m_overlappingPairsGPU->getBufferCL();
aabbsWS = m_data->m_allAabbsGPU->getBufferCL();
} else
}
else
{
pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
@@ -302,31 +292,27 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
//mark the contacts for each pair as 'unused'
if (numPairs)
{
b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context,m_data->m_queue);
gpuPairs.setFromOpenCLBuffer(pairs,numPairs);
b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context, m_data->m_queue);
gpuPairs.setFromOpenCLBuffer(pairs, numPairs);
if (gClearPairsOnGpu)
{
//b3AlignedObjectArray<b3BroadphasePair> hostPairs;//just for debugging
//gpuPairs.copyToHost(hostPairs);
b3LauncherCL launcher(m_data->m_queue,m_data->m_clearOverlappingPairsKernel,"clearOverlappingPairsKernel");
b3LauncherCL launcher(m_data->m_queue, m_data->m_clearOverlappingPairsKernel, "clearOverlappingPairsKernel");
launcher.setBuffer(pairs);
launcher.setConst(numPairs);
launcher.launch1D(numPairs);
//gpuPairs.copyToHost(hostPairs);
} else
}
else
{
b3AlignedObjectArray<b3BroadphasePair> hostPairs;
gpuPairs.copyToHost(hostPairs);
for (int i=0;i<hostPairs.size();i++)
for (int i = 0; i < hostPairs.size(); i++)
{
hostPairs[i].z = 0xffffffff;
}
@@ -335,7 +321,7 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
}
m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies);
m_data->m_narrowphase->computeContacts(pairs, numPairs, aabbsWS, numBodies);
numContacts = m_data->m_narrowphase->getNumContactsGpu();
if (gUseDbvt)
@@ -347,56 +333,54 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
if (gDumpContactStats && numContacts)
{
m_data->m_narrowphase->getContactsGpu();
printf("numContacts = %d\n", numContacts);
int totalPoints = 0;
int totalPoints = 0;
const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU();
for (int i=0;i<numContacts;i++)
for (int i = 0; i < numContacts; i++)
{
totalPoints += contacts->getNPoints();
}
printf("totalPoints=%d\n",totalPoints);
printf("totalPoints=%d\n", totalPoints);
}
}
//convert contact points to contact constraints
//solve constraints
b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context,m_data->m_queue,0,true);
gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context,m_data->m_queue,0,true);
gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context,m_data->m_queue,0,true);
gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu());
b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context, m_data->m_queue, 0, true);
gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(), m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context, m_data->m_queue, 0, true);
gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(), m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context, m_data->m_queue, 0, true);
gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(), m_data->m_narrowphase->getNumContactsGpu());
int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size();
int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size();
if (useBullet2CpuSolver && numJoints)
{
// b3AlignedObjectArray<b3Contact4> hostContacts;
// b3AlignedObjectArray<b3Contact4> hostContacts;
//gpuContacts.copyToHost(hostContacts);
{
bool useGpu = m_data->m_joints.size()==0;
bool useGpu = m_data->m_joints.size() == 0;
// b3Contact4* contacts = numContacts? &hostContacts[0]: 0;
// b3Contact4* contacts = numContacts? &hostContacts[0]: 0;
//m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,contacts,numJoints, joints);
if (useGpu)
{
m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(),&gpuBodies,&gpuInertias,numJoints, m_data->m_gpuConstraints);
} else
m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(), &gpuBodies, &gpuInertias, numJoints, m_data->m_gpuConstraints);
}
else
{
b3AlignedObjectArray<b3RigidBodyData> hostBodies;
gpuBodies.copyToHost(hostBodies);
b3AlignedObjectArray<b3InertiaData> hostInertias;
gpuInertias.copyToHost(hostInertias);
b3TypedConstraint** joints = numJoints? &m_data->m_joints[0] : 0;
m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints);
b3TypedConstraint** joints = numJoints ? &m_data->m_joints[0] : 0;
m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(), &hostBodies[0], &hostInertias[0], 0, 0, numJoints, joints);
gpuBodies.copyFromHost(hostBodies);
}
}
@@ -404,22 +388,20 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
if (numContacts)
{
#ifdef TEST_OTHER_GPU_SOLVER
if (gUseJacobi)
{
bool useGpu = true;
if (useGpu)
{
bool forceHost = false;
if (forceHost)
{
b3AlignedObjectArray<b3RigidBodyData> hostBodies;
b3AlignedObjectArray<b3InertiaData> hostInertias;
b3AlignedObjectArray<b3Contact4> hostContacts;
{
B3_PROFILE("copyToHost");
gpuBodies.copyToHost(hostBodies);
@@ -429,25 +411,24 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
b3JacobiSolverInfo solverInfo;
m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(),&hostContacts[0],hostContacts.size(),solverInfo);
m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(), &hostContacts[0], hostContacts.size(), solverInfo);
}
{
B3_PROFILE("copyFromHost");
gpuBodies.copyFromHost(hostBodies);
}
} else
}
else
{
int static0Index = m_data->m_narrowphase->getStatic0Index();
b3JacobiSolverInfo solverInfo;
//m_data->m_solver3->solveContacts( >solveGroup(&gpuBodies, &gpuInertias, &gpuContacts,solverInfo);
//m_data->m_solver3->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index);
m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
}
} else
}
else
{
b3AlignedObjectArray<b3RigidBodyData> hostBodies;
gpuBodies.copyToHost(hostBodies);
@@ -460,17 +441,15 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
gpuBodies.copyFromHost(hostBodies);
}
} else
#endif //TEST_OTHER_GPU_SOLVER
}
else
#endif //TEST_OTHER_GPU_SOLVER
{
int static0Index = m_data->m_narrowphase->getStatic0Index();
m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index);
m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
//m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL());
/*m_data->m_solver3->solveContactConstraintHost(
(b3OpenCLArray<RigidBodyBase::Body>*)&gpuBodies,
(b3OpenCLArray<RigidBodyBase::Inertia>*)&gpuInertias,
@@ -481,11 +460,9 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
integrate(deltaTime);
}
void b3GpuRigidBodyPipeline::integrate(float timeStep)
void b3GpuRigidBodyPipeline::integrate(float timeStep)
{
//integrate
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
@@ -493,24 +470,25 @@ void b3GpuRigidBodyPipeline::integrate(float timeStep)
if (gIntegrateOnCpu)
{
if(numBodies)
if (numBodies)
{
b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData();
b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData();
npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU);
b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0);
for (int nodeID=0;nodeID<numBodies;nodeID++)
for (int nodeID = 0; nodeID < numBodies; nodeID++)
{
integrateSingleTransform( bodies,nodeID, timeStep, angularDamp, m_data->m_gravity);
integrateSingleTransform(bodies, nodeID, timeStep, angularDamp, m_data->m_gravity);
}
npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU);
}
} else
}
else
{
b3LauncherCL launcher(m_data->m_queue,m_data->m_integrateTransformsKernel,"m_integrateTransformsKernel");
b3LauncherCL launcher(m_data->m_queue, m_data->m_integrateTransformsKernel, "m_integrateTransformsKernel");
launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
launcher.setConst(numBodies);
launcher.setConst(timeStep);
launcher.setConst(angularDamp);
@@ -519,12 +497,9 @@ void b3GpuRigidBodyPipeline::integrate(float timeStep)
}
}
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
{
cl_int ciErrNum=0;
cl_int ciErrNum = 0;
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
if (!numBodies)
@@ -532,34 +507,35 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
if (gCalcWorldSpaceAabbOnCpu)
{
if (numBodies)
{
if (gUseDbvt)
{
m_data->m_allAabbsCPU.resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i=0;i<numBodies;i++)
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_allAabbsCPU[0]);
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_allAabbsCPU[0]);
}
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
} else
}
else
{
m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i=0;i<numBodies;i++)
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
}
m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU());
//m_data->m_broadphaseSap->writeAabbsToGpu();
}
}
} else
}
else
{
//__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
b3LauncherCL launcher(m_data->m_queue,m_data->m_updateAabbsKernel,"m_updateAabbsKernel");
b3LauncherCL launcher(m_data->m_queue, m_data->m_updateAabbsKernel, "m_updateAabbsKernel");
launcher.setConst(numBodies);
cl_mem bodies = m_data->m_narrowphase->getBodiesGpu();
launcher.setBuffer(bodies);
@@ -568,17 +544,18 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu();
launcher.setBuffer(localAabbs);
cl_mem worldAabbs =0;
cl_mem worldAabbs = 0;
if (gUseDbvt)
{
worldAabbs = m_data->m_allAabbsGPU->getBufferCL();
} else
}
else
{
worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS();
}
launcher.setBuffer(worldAabbs);
launcher.launch1D(numBodies);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
@@ -595,78 +572,68 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
};
*/
}
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
{
return m_data->m_narrowphase->getBodiesGpu();
}
int b3GpuRigidBodyPipeline::getNumBodies() const
int b3GpuRigidBodyPipeline::getNumBodies() const
{
return m_data->m_narrowphase->getNumRigidBodies();
}
void b3GpuRigidBodyPipeline::setGravity(const float* grav)
void b3GpuRigidBodyPipeline::setGravity(const float* grav)
{
m_data->m_gravity.setValue(grav[0],grav[1],grav[2]);
m_data->m_gravity.setValue(grav[0], grav[1], grav[2]);
}
void b3GpuRigidBodyPipeline::copyConstraintsToHost()
void b3GpuRigidBodyPipeline::copyConstraintsToHost()
{
m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
}
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
}
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
{
b3Vector3 aabbMin=b3MakeVector3(0,0,0),aabbMax=b3MakeVector3(0,0,0);
b3Vector3 aabbMin = b3MakeVector3(0, 0, 0), aabbMax = b3MakeVector3(0, 0, 0);
if (collidableIndex>=0)
if (collidableIndex >= 0)
{
b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
b3Vector3 localAabbMin=b3MakeVector3(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]);
b3Vector3 localAabbMax=b3MakeVector3(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]);
b3Vector3 localAabbMin = b3MakeVector3(localAabb.m_min[0], localAabb.m_min[1], localAabb.m_min[2]);
b3Vector3 localAabbMax = b3MakeVector3(localAabb.m_max[0], localAabb.m_max[1], localAabb.m_max[2]);
b3Scalar margin = 0.01f;
b3Transform t;
t.setIdentity();
t.setOrigin(b3MakeVector3(position[0],position[1],position[2]));
t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3]));
b3TransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax);
} else
t.setOrigin(b3MakeVector3(position[0], position[1], position[2]));
t.setRotation(b3Quaternion(orientation[0], orientation[1], orientation[2], orientation[3]));
b3TransformAabb(localAabbMin, localAabbMax, margin, t, aabbMin, aabbMax);
}
else
{
b3Error("registerPhysicsInstance using invalid collidableIndex\n");
return -1;
}
bool writeToGpu = false;
int bodyIndex = m_data->m_narrowphase->getNumRigidBodies();
bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu);
bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex, mass, position, orientation, &aabbMin.getX(), &aabbMax.getX(), writeToGpu);
if (bodyIndex>=0)
if (bodyIndex >= 0)
{
if (gUseDbvt)
{
m_data->m_broadphaseDbvt->createProxy(aabbMin,aabbMax,bodyIndex,0,1,1);
m_data->m_broadphaseDbvt->createProxy(aabbMin, aabbMax, bodyIndex, 0, 1, 1);
b3SapAabb aabb;
for (int i=0;i<3;i++)
for (int i = 0; i < 3; i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
@@ -677,14 +644,16 @@ int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* po
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
}
} else
}
else
{
if (mass)
{
m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher);
} else
m_data->m_broadphaseSap->createProxy(aabbMin, aabbMax, bodyIndex, 1, 1); //m_dispatcher);
}
else
{
m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher);
m_data->m_broadphaseSap->createLargeProxy(aabbMin, aabbMax, bodyIndex, 1, 1); //m_dispatcher);
}
}
}
@@ -699,10 +668,10 @@ int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* po
return bodyIndex;
}
void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults)
void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults)
{
this->m_data->m_raycaster->castRays(rays,hitResults,
getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
this->m_data->m_raycaster->castRays(rays, hitResults,
getNumBodies(), this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
}

View File

@@ -25,50 +25,46 @@ subject to the following restrictions:
class b3GpuRigidBodyPipeline
{
protected:
struct b3GpuRigidBodyPipelineInternalData* m_data;
struct b3GpuRigidBodyPipelineInternalData* m_data;
int allocateCollidable();
public:
b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q , class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config);
b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config);
virtual ~b3GpuRigidBodyPipeline();
void stepSimulation(float deltaTime);
void integrate(float timeStep);
void setupGpuAabbsFull();
void stepSimulation(float deltaTime);
void integrate(float timeStep);
void setupGpuAabbsFull();
int registerConvexPolyhedron(class b3ConvexUtility* convex);
int registerConvexPolyhedron(class b3ConvexUtility* convex);
//int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
//int registerSphereShape(float radius);
//int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
//int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
//if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered
void writeAllInstancesToGpu();
void copyConstraintsToHost();
void setGravity(const float* grav);
void writeAllInstancesToGpu();
void copyConstraintsToHost();
void setGravity(const float* grav);
void reset();
int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold);
int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold);
int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold);
void removeConstraintByUid(int uid);
void addConstraint(class b3TypedConstraint* constraint);
void removeConstraint(b3TypedConstraint* constraint);
void addConstraint(class b3TypedConstraint* constraint);
void removeConstraint(b3TypedConstraint* constraint);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
cl_mem getBodyBuffer();
int getNumBodies() const;
cl_mem getBodyBuffer();
int getNumBodies() const;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_H
#endif //B3_GPU_RIGIDBODY_PIPELINE_H

View File

@@ -22,52 +22,47 @@ subject to the following restrictions:
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h"
#include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h"
struct b3GpuRigidBodyPipelineInternalData
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
cl_kernel m_clearOverlappingPairsKernel;
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
cl_kernel m_clearOverlappingPairsKernel;
class b3PgsJacobiSolver* m_solver;
class b3GpuPgsConstraintSolver* m_gpuSolver;
class b3GpuPgsContactSolver* m_solver2;
class b3GpuJacobiContactSolver* m_solver3;
class b3GpuRaycast* m_raycaster;
class b3GpuBroadphaseInterface* m_broadphaseSap;
struct b3DynamicBvhBroadphase* m_broadphaseDbvt;
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
b3OpenCLArray<b3GpuGenericConstraint>* m_gpuConstraints;
b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints;
b3AlignedObjectArray<b3TypedConstraint*> m_joints;
int m_constraintUid;
class b3GpuNarrowPhase* m_narrowphase;
b3Vector3 m_gravity;
int m_constraintUid;
class b3GpuNarrowPhase* m_narrowphase;
b3Vector3 m_gravity;
b3Config m_config;
b3Config m_config;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H

View File

@@ -13,11 +13,9 @@ subject to the following restrictions:
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_SOLVER_BODY_H
#define B3_GPU_SOLVER_BODY_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Matrix3x3.h"
@@ -27,29 +25,27 @@ subject to the following restrictions:
///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision
#ifdef B3_USE_SSE
#define USE_SIMD 1
#endif //
#endif //
///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuSolverBody
{
B3_DECLARE_ALIGNED_ALLOCATOR();
// b3Transform m_worldTransformUnused;
b3Vector3 m_deltaLinearVelocity;
b3Vector3 m_deltaAngularVelocity;
b3Vector3 m_angularFactor;
b3Vector3 m_linearFactor;
b3Vector3 m_invMass;
b3Vector3 m_pushVelocity;
b3Vector3 m_turnVelocity;
b3Vector3 m_linearVelocity;
b3Vector3 m_angularVelocity;
// b3Transform m_worldTransformUnused;
b3Vector3 m_deltaLinearVelocity;
b3Vector3 m_deltaAngularVelocity;
b3Vector3 m_angularFactor;
b3Vector3 m_linearFactor;
b3Vector3 m_invMass;
b3Vector3 m_pushVelocity;
b3Vector3 m_turnVelocity;
b3Vector3 m_linearVelocity;
b3Vector3 m_angularVelocity;
union
{
void* m_originalBody;
int m_originalBodyIndex;
union {
void* m_originalBody;
int m_originalBodyIndex;
};
int padding[3];
@@ -65,44 +61,41 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
return m_worldTransform;
}
*/
B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const
B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
{
if (m_originalBody)
velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
else
velocity.setValue(0,0,0);
velocity.setValue(0, 0, 0);
}
B3_FORCE_INLINE void getAngularVelocity(b3Vector3& angVel) const
B3_FORCE_INLINE void getAngularVelocity(b3Vector3 & angVel) const
{
if (m_originalBody)
angVel =m_angularVelocity+m_deltaAngularVelocity;
angVel = m_angularVelocity + m_deltaAngularVelocity;
else
angVel.setValue(0,0,0);
angVel.setValue(0, 0, 0);
}
//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude)
B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
{
if (m_originalBody)
{
m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
}
}
B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,b3Scalar impulseMagnitude)
B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, b3Scalar impulseMagnitude)
{
if (m_originalBody)
{
m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor;
m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
m_pushVelocity += linearComponent * impulseMagnitude * m_linearFactor;
m_turnVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
}
}
const b3Vector3& getDeltaLinearVelocity() const
{
return m_deltaLinearVelocity;
@@ -113,20 +106,19 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
return m_deltaAngularVelocity;
}
const b3Vector3& getPushVelocity() const
const b3Vector3& getPushVelocity() const
{
return m_pushVelocity;
}
const b3Vector3& getTurnVelocity() const
const b3Vector3& getTurnVelocity() const
{
return m_turnVelocity;
}
////////////////////////////////////////////////
///some internal methods, don't use them
b3Vector3& internalGetDeltaLinearVelocity()
{
return m_deltaLinearVelocity;
@@ -151,7 +143,7 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
{
m_invMass = invMass;
}
b3Vector3& internalGetPushVelocity()
{
return m_pushVelocity;
@@ -162,67 +154,57 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
return m_turnVelocity;
}
B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const
B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
{
velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
}
B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3& angVel) const
B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3 & angVel) const
{
angVel = m_angularVelocity+m_deltaAngularVelocity;
angVel = m_angularVelocity + m_deltaAngularVelocity;
}
//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude)
B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
{
//if (m_originalBody)
{
m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
}
}
void writebackVelocity()
void writebackVelocity()
{
//if (m_originalBody>=0)
{
m_linearVelocity +=m_deltaLinearVelocity;
m_linearVelocity += m_deltaLinearVelocity;
m_angularVelocity += m_deltaAngularVelocity;
//m_originalBody->setCompanionId(-1);
}
}
void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
{
(void) timeStep;
(void)timeStep;
if (m_originalBody)
{
m_linearVelocity += m_deltaLinearVelocity;
m_angularVelocity += m_deltaAngularVelocity;
//correct the position/orientation based on push/turn recovery
b3Transform newTransform;
if (m_pushVelocity[0]!=0.f || m_pushVelocity[1]!=0 || m_pushVelocity[2]!=0 || m_turnVelocity[0]!=0.f || m_turnVelocity[1]!=0 || m_turnVelocity[2]!=0)
if (m_pushVelocity[0] != 0.f || m_pushVelocity[1] != 0 || m_pushVelocity[2] != 0 || m_turnVelocity[0] != 0.f || m_turnVelocity[1] != 0 || m_turnVelocity[2] != 0)
{
// b3Quaternion orn = m_worldTransform.getRotation();
// b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
// m_worldTransform = newTransform;
// b3Quaternion orn = m_worldTransform.getRotation();
// b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
// m_worldTransform = newTransform;
}
//m_worldTransform.setRotation(orn);
//m_originalBody->setCompanionId(-1);
}
}
};
#endif //B3_SOLVER_BODY_H
#endif //B3_SOLVER_BODY_H

View File

@@ -13,11 +13,9 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_GPU_SOLVER_CONSTRAINT_H
#define B3_GPU_SOLVER_CONSTRAINT_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Matrix3x3.h"
//#include "b3JacobianEntry.h"
@@ -25,58 +23,51 @@ subject to the following restrictions:
//#define NO_FRICTION_TANGENTIALS 1
///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverConstraint
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuSolverConstraint
{
B3_DECLARE_ALIGNED_ALLOCATOR();
b3Vector3 m_relpos1CrossNormal;
b3Vector3 m_contactNormal;
b3Vector3 m_relpos1CrossNormal;
b3Vector3 m_contactNormal;
b3Vector3 m_relpos2CrossNormal;
b3Vector3 m_relpos2CrossNormal;
//b3Vector3 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
b3Vector3 m_angularComponentA;
b3Vector3 m_angularComponentB;
mutable b3Scalar m_appliedPushImpulse;
mutable b3Scalar m_appliedImpulse;
b3Vector3 m_angularComponentA;
b3Vector3 m_angularComponentB;
mutable b3Scalar m_appliedPushImpulse;
mutable b3Scalar m_appliedImpulse;
int m_padding1;
int m_padding2;
b3Scalar m_friction;
b3Scalar m_jacDiagABInv;
b3Scalar m_rhs;
b3Scalar m_cfm;
b3Scalar m_lowerLimit;
b3Scalar m_upperLimit;
b3Scalar m_rhsPenetration;
union
{
void* m_originalContactPoint;
int m_originalConstraintIndex;
b3Scalar m_unusedPadding4;
b3Scalar m_friction;
b3Scalar m_jacDiagABInv;
b3Scalar m_rhs;
b3Scalar m_cfm;
b3Scalar m_lowerLimit;
b3Scalar m_upperLimit;
b3Scalar m_rhsPenetration;
union {
void* m_originalContactPoint;
int m_originalConstraintIndex;
b3Scalar m_unusedPadding4;
};
int m_overrideNumSolverIterations;
int m_frictionIndex;
int m_overrideNumSolverIterations;
int m_frictionIndex;
int m_solverBodyIdA;
int m_solverBodyIdB;
enum b3SolverConstraintType
enum b3SolverConstraintType
{
B3_SOLVER_CONTACT_1D = 0,
B3_SOLVER_FRICTION_1D
};
};
typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray;
#endif //B3_GPU_SOLVER_CONSTRAINT_H
typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray;
#endif //B3_GPU_SOLVER_CONSTRAINT_H

File diff suppressed because it is too large Load Diff

View File

@@ -13,7 +13,6 @@ subject to the following restrictions:
*/
//Originally written by Takahiro Harada
#ifndef __ADL_SOLVER_H
#define __ADL_SOLVER_H
@@ -29,98 +28,83 @@ subject to the following restrictions:
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
#define B3NEXTMULTIPLEOF(num, alignment) (((num) / (alignment) + (((num) % (alignment) == 0) ? 0 : 1)) * (alignment))
enum
{
B3_SOLVER_N_SPLIT_X = 8,//16,//4,
B3_SOLVER_N_SPLIT_Y = 4,//16,//4,
B3_SOLVER_N_SPLIT_Z = 8,//,
B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X*B3_SOLVER_N_SPLIT_Y*B3_SOLVER_N_SPLIT_Z,
B3_SOLVER_N_BATCHES = 8,//4,//8,//4,
B3_SOLVER_N_SPLIT_X = 8, //16,//4,
B3_SOLVER_N_SPLIT_Y = 4, //16,//4,
B3_SOLVER_N_SPLIT_Z = 8, //,
B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X * B3_SOLVER_N_SPLIT_Y * B3_SOLVER_N_SPLIT_Z,
B3_SOLVER_N_BATCHES = 8, //4,//8,//4,
B3_MAX_NUM_BATCHES = 128,
};
class b3SolverBase
{
public:
public:
struct ConstraintCfg
{
ConstraintCfg(float dt = 0.f) : m_positionDrift(0.005f), m_positionConstraintCoeff(0.2f), m_dt(dt), m_staticIdx(-1) {}
struct ConstraintCfg
{
ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(-1) {}
float m_positionDrift;
float m_positionConstraintCoeff;
float m_dt;
bool m_enableParallelSolve;
float m_batchCellSize;
int m_staticIdx;
};
float m_positionDrift;
float m_positionConstraintCoeff;
float m_dt;
bool m_enableParallelSolve;
float m_batchCellSize;
int m_staticIdx;
};
};
class b3Solver : public b3SolverBase
{
public:
public:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
b3OpenCLArray<int> m_batchSizes;
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
b3OpenCLArray<int> m_batchSizes;
int m_nIterations;
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
cl_kernel m_contactToConstraintKernel;
cl_kernel m_setSortDataKernel;
cl_kernel m_reorderContactKernel;
cl_kernel m_copyConstraintKernel;
int m_nIterations;
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
cl_kernel m_contactToConstraintKernel;
cl_kernel m_setSortDataKernel;
cl_kernel m_reorderContactKernel;
cl_kernel m_copyConstraintKernel;
class b3RadixSort32CL* m_sort32;
class b3BoundSearchCL* m_search;
class b3PrefixScanCL* m_scan;
class b3RadixSort32CL* m_sort32;
class b3BoundSearchCL* m_search;
class b3PrefixScanCL* m_scan;
b3OpenCLArray<b3SortData>* m_sortDataBuffer;
b3OpenCLArray<b3Contact4>* m_contactBuffer2;
b3OpenCLArray<b3SortData>* m_sortDataBuffer;
b3OpenCLArray<b3Contact4>* m_contactBuffer2;
enum
{
DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
};
enum
{
DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
};
b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3Solver();
virtual ~b3Solver();
void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches);
void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches);
void solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, b3AlignedObjectArray<int>* batchSizes);
void solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes);
void convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
int nContacts, const ConstraintCfg& cfg);
void convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
int nContacts, const ConstraintCfg& cfg );
void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx);
};
#endif //__ADL_SOLVER_H
#endif //__ADL_SOLVER_H

View File

@@ -1,388 +1,387 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* batchingKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#ifndef B3_CONTACT4DATA_H\n"
"#define B3_CONTACT4DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"typedef struct b3Contact4Data b3Contact4Data_t;\n"
"struct b3Contact4Data\n"
"{\n"
" b3Float4 m_worldPosB[4];\n"
"// b3Float4 m_localPosA[4];\n"
"// b3Float4 m_localPosB[4];\n"
" b3Float4 m_worldNormalOnB; // w: m_nPoints\n"
" unsigned short m_restituitionCoeffCmp;\n"
" unsigned short m_frictionCoeffCmp;\n"
" int m_batchIdx;\n"
" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
" int m_bodyBPtrAndSignBit;\n"
" int m_childIndexA;\n"
" int m_childIndexB;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"};\n"
"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
"{\n"
" return (int)contact->m_worldNormalOnB.w;\n"
"};\n"
"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
"{\n"
" contact->m_worldNormalOnB.w = (float)numPoints;\n"
"};\n"
"#endif //B3_CONTACT4DATA_H\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"#define WG_SIZE 64\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"#define STACK_SIZE (WG_SIZE*10)\n"
"//#define STACK_SIZE (WG_SIZE)\n"
"#define RING_SIZE 1024\n"
"#define RING_SIZE_MASK (RING_SIZE-1)\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
"#define RING_END ldsTmp\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
"// buff[bufIdx] |= (1<<bitIdx);\n"
" atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
" __local u32 ldsStackEnd;\n"
" __local Elem ldsRingElem[RING_SIZE];\n"
" __local u32 ldsRingEnd;\n"
" __local u32 ldsTmp;\n"
" __local u32 ldsCheckBuffer[CHECK_SIZE];\n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" __local u32 ldsGEnd;\n"
" __local u32 ldsDstEnd;\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsRingEnd = 0;\n"
" ldsGEnd = 0;\n"
" ldsStackEnd = 0;\n"
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
" \n"
" \n"
"// while(1)\n"
"//was 250\n"
" int ie=0;\n"
" int maxBatch = 0;\n"
" for(ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
" for(int giter=0; giter<4; giter++)\n"
" {\n"
" int ringCap = GET_RING_CAPACITY;\n"
" \n"
" // 1. fill ring\n"
" if( ldsGEnd < m_n )\n"
" {\n"
" while( ringCap > WG_SIZE )\n"
" {\n"
" if( ldsGEnd >= m_n ) break;\n"
" if( lIdx < ringCap - WG_SIZE )\n"
" {\n"
" int srcIdx;\n"
" AtomInc1( ldsGEnd, srcIdx );\n"
" if( srcIdx < m_n )\n"
" {\n"
" int dstIdx;\n"
" AtomInc1( ldsRingEnd, dstIdx );\n"
" \n"
" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
" ldsRingElem[dstIdx].m_idx = srcIdx;\n"
" }\n"
" }\n"
" ringCap = GET_RING_CAPACITY;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" // 2. fill stack\n"
" __local Elem* dst = ldsRingElem;\n"
" if( lIdx == 0 ) RING_END = 0;\n"
" int srcIdx=lIdx;\n"
" int end = ldsRingEnd;\n"
" {\n"
" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
" {\n"
" Elem e;\n"
" if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
" bool done = (srcIdx<end)?false:true;\n"
" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
" \n"
" if( !done )\n"
" {\n"
" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
" if( aUsed==0 && bUsed==0 )\n"
" {\n"
" int aAvailable=1;\n"
" int bAvailable=1;\n"
" int ea = abs(e.m_a);\n"
" int eb = abs(e.m_b);\n"
" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
" \n"
" if (!aStatic)\n"
" aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" if (!bStatic)\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
" \n"
" //aAvailable = aStatic? 1: aAvailable;\n"
" //bAvailable = bStatic? 1: bAvailable;\n"
" bool success = (aAvailable && bAvailable);\n"
" if(success)\n"
" {\n"
" \n"
" if (!aStatic)\n"
" writeBuf( ldsFixedBuffer, ea );\n"
" if (!bStatic)\n"
" writeBuf( ldsFixedBuffer, eb );\n"
" }\n"
" done = success;\n"
" }\n"
" }\n"
" // put it aside\n"
" if(srcIdx<end)\n"
" {\n"
" if( done )\n"
" {\n"
" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
" if( dstIdx < STACK_SIZE )\n"
" ldsStackIdx[dstIdx] = e.m_idx;\n"
" else{\n"
" done = false;\n"
" AtomAdd( ldsStackEnd, -1 );\n"
" }\n"
" }\n"
" if( !done )\n"
" {\n"
" int dstIdx; AtomInc1( RING_END, dstIdx );\n"
" dst[dstIdx] = e;\n"
" }\n"
" }\n"
" // if filled, flush\n"
" if( ldsStackEnd == STACK_SIZE )\n"
" {\n"
" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
" ldsFixedBuffer[lIdx] = 0;\n"
" }\n"
" }\n"
" }\n"
" if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" // in case it couldn't consume any pair. Flush them\n"
" // todo. Serial batch worth while?\n"
" if( ldsStackEnd == 0 )\n"
" {\n"
" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" int curBatch = 100+i;\n"
" if (maxBatch < curBatch)\n"
" maxBatch = curBatch;\n"
" \n"
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
" \n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" GROUP_LDS_BARRIER;\n"
" // termination\n"
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
" if( lIdx == 0 )\n"
" {\n"
" if (maxBatch < ie)\n"
" maxBatch=ie;\n"
" batchSizes[wgIdx]=maxBatch;\n"
" }\n"
"}\n"
;
static const char* batchingKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#ifndef B3_CONTACT4DATA_H\n"
"#define B3_CONTACT4DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"typedef struct b3Contact4Data b3Contact4Data_t;\n"
"struct b3Contact4Data\n"
"{\n"
" b3Float4 m_worldPosB[4];\n"
"// b3Float4 m_localPosA[4];\n"
"// b3Float4 m_localPosB[4];\n"
" b3Float4 m_worldNormalOnB; // w: m_nPoints\n"
" unsigned short m_restituitionCoeffCmp;\n"
" unsigned short m_frictionCoeffCmp;\n"
" int m_batchIdx;\n"
" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
" int m_bodyBPtrAndSignBit;\n"
" int m_childIndexA;\n"
" int m_childIndexB;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"};\n"
"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
"{\n"
" return (int)contact->m_worldNormalOnB.w;\n"
"};\n"
"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
"{\n"
" contact->m_worldNormalOnB.w = (float)numPoints;\n"
"};\n"
"#endif //B3_CONTACT4DATA_H\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"#define WG_SIZE 64\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"#define STACK_SIZE (WG_SIZE*10)\n"
"//#define STACK_SIZE (WG_SIZE)\n"
"#define RING_SIZE 1024\n"
"#define RING_SIZE_MASK (RING_SIZE-1)\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
"#define RING_END ldsTmp\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
"// buff[bufIdx] |= (1<<bitIdx);\n"
" atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
" __local u32 ldsStackEnd;\n"
" __local Elem ldsRingElem[RING_SIZE];\n"
" __local u32 ldsRingEnd;\n"
" __local u32 ldsTmp;\n"
" __local u32 ldsCheckBuffer[CHECK_SIZE];\n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" __local u32 ldsGEnd;\n"
" __local u32 ldsDstEnd;\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsRingEnd = 0;\n"
" ldsGEnd = 0;\n"
" ldsStackEnd = 0;\n"
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
" \n"
" \n"
"// while(1)\n"
"//was 250\n"
" int ie=0;\n"
" int maxBatch = 0;\n"
" for(ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
" for(int giter=0; giter<4; giter++)\n"
" {\n"
" int ringCap = GET_RING_CAPACITY;\n"
" \n"
" // 1. fill ring\n"
" if( ldsGEnd < m_n )\n"
" {\n"
" while( ringCap > WG_SIZE )\n"
" {\n"
" if( ldsGEnd >= m_n ) break;\n"
" if( lIdx < ringCap - WG_SIZE )\n"
" {\n"
" int srcIdx;\n"
" AtomInc1( ldsGEnd, srcIdx );\n"
" if( srcIdx < m_n )\n"
" {\n"
" int dstIdx;\n"
" AtomInc1( ldsRingEnd, dstIdx );\n"
" \n"
" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
" ldsRingElem[dstIdx].m_idx = srcIdx;\n"
" }\n"
" }\n"
" ringCap = GET_RING_CAPACITY;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" // 2. fill stack\n"
" __local Elem* dst = ldsRingElem;\n"
" if( lIdx == 0 ) RING_END = 0;\n"
" int srcIdx=lIdx;\n"
" int end = ldsRingEnd;\n"
" {\n"
" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
" {\n"
" Elem e;\n"
" if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
" bool done = (srcIdx<end)?false:true;\n"
" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
" \n"
" if( !done )\n"
" {\n"
" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
" if( aUsed==0 && bUsed==0 )\n"
" {\n"
" int aAvailable=1;\n"
" int bAvailable=1;\n"
" int ea = abs(e.m_a);\n"
" int eb = abs(e.m_b);\n"
" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
" \n"
" if (!aStatic)\n"
" aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" if (!bStatic)\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
" \n"
" //aAvailable = aStatic? 1: aAvailable;\n"
" //bAvailable = bStatic? 1: bAvailable;\n"
" bool success = (aAvailable && bAvailable);\n"
" if(success)\n"
" {\n"
" \n"
" if (!aStatic)\n"
" writeBuf( ldsFixedBuffer, ea );\n"
" if (!bStatic)\n"
" writeBuf( ldsFixedBuffer, eb );\n"
" }\n"
" done = success;\n"
" }\n"
" }\n"
" // put it aside\n"
" if(srcIdx<end)\n"
" {\n"
" if( done )\n"
" {\n"
" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
" if( dstIdx < STACK_SIZE )\n"
" ldsStackIdx[dstIdx] = e.m_idx;\n"
" else{\n"
" done = false;\n"
" AtomAdd( ldsStackEnd, -1 );\n"
" }\n"
" }\n"
" if( !done )\n"
" {\n"
" int dstIdx; AtomInc1( RING_END, dstIdx );\n"
" dst[dstIdx] = e;\n"
" }\n"
" }\n"
" // if filled, flush\n"
" if( ldsStackEnd == STACK_SIZE )\n"
" {\n"
" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
" ldsFixedBuffer[lIdx] = 0;\n"
" }\n"
" }\n"
" }\n"
" if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" // in case it couldn't consume any pair. Flush them\n"
" // todo. Serial batch worth while?\n"
" if( ldsStackEnd == 0 )\n"
" {\n"
" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" int curBatch = 100+i;\n"
" if (maxBatch < curBatch)\n"
" maxBatch = curBatch;\n"
" \n"
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
" \n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" GROUP_LDS_BARRIER;\n"
" // termination\n"
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
" if( lIdx == 0 )\n"
" {\n"
" if (maxBatch < ie)\n"
" maxBatch=ie;\n"
" batchSizes[wgIdx]=maxBatch;\n"
" }\n"
"}\n";

View File

@@ -1,291 +1,290 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL program with two batching kernels:
// - CreateBatchesBruteForce: work-item 0 assigns a distinct m_batchIdx (= i)
//   to every contact of the workgroup (serial fallback, no body sharing check).
// - CreateBatchesNew: work-item 0 greedily builds batches, using the
//   ldsFixedBuffer bit-set so two contacts in one batch never share a
//   non-static body; accepted contacts are swapped to the front and the
//   resulting batch count is stored in batchSizes[wgIdx].
// NOTE(review): autogenerated content - regenerate via stringify, do not hand-edit.
static const char* batchingKernelsNewCL= \
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#ifndef B3_CONTACT4DATA_H\n"
	"#define B3_CONTACT4DATA_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#define B3_FLOAT4_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#define B3_PLATFORM_DEFINITIONS_H\n"
	"struct MyTest\n"
	"{\n"
	"	int bla;\n"
	"};\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
	"#define B3_LARGE_FLOAT 1e18f\n"
	"#define B3_INFINITY 1e18f\n"
	"#define b3Assert(a)\n"
	"#define b3ConstArray(a) __global const a*\n"
	"#define b3AtomicInc atomic_inc\n"
	"#define b3AtomicAdd atomic_add\n"
	"#define b3Fabs fabs\n"
	"#define b3Sqrt native_sqrt\n"
	"#define b3Sin native_sin\n"
	"#define b3Cos native_cos\n"
	"#define B3_STATIC\n"
	"#endif\n"
	"#endif\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4 b3Float4;\n"
	"	#define b3Float4ConstArg const b3Float4\n"
	"	#define b3MakeFloat4 (float4)\n"
	"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return dot(a1, b1);\n"
	"	}\n"
	"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return cross(a1, b1);\n"
	"	}\n"
	"	#define b3MinFloat4 min\n"
	"	#define b3MaxFloat4 max\n"
	"	#define b3Normalized(a) normalize(a)\n"
	"#endif \n"
	"	\n"
	"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
	"{\n"
	"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
	"		return false;\n"
	"	return true;\n"
	"}\n"
	"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
	"{\n"
	"    float maxDot = -B3_INFINITY;\n"
	"    int i = 0;\n"
	"    int ptIndex = -1;\n"
	"    for( i = 0; i < vecLen; i++ )\n"
	"    {\n"
	"        float dot = b3Dot3F4(vecArray[i],vec);\n"
	"        \n"
	"        if( dot > maxDot )\n"
	"        {\n"
	"            maxDot = dot;\n"
	"            ptIndex = i;\n"
	"        }\n"
	"    }\n"
	"	b3Assert(ptIndex>=0);\n"
	"    if (ptIndex<0)\n"
	"	{\n"
	"		ptIndex = 0;\n"
	"	}\n"
	"    *dotOut = maxDot;\n"
	"    return ptIndex;\n"
	"}\n"
	"#endif //B3_FLOAT4_H\n"
	"typedef struct b3Contact4Data b3Contact4Data_t;\n"
	"struct b3Contact4Data\n"
	"{\n"
	"	b3Float4		m_worldPosB[4];\n"
	"//	b3Float4		m_localPosA[4];\n"
	"//	b3Float4		m_localPosB[4];\n"
	"	b3Float4		m_worldNormalOnB;	//	w: m_nPoints\n"
	"	unsigned short  m_restituitionCoeffCmp;\n"
	"	unsigned short  m_frictionCoeffCmp;\n"
	"	int m_batchIdx;\n"
	"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
	"	int m_bodyBPtrAndSignBit;\n"
	"	int	m_childIndexA;\n"
	"	int	m_childIndexB;\n"
	"	int m_unused1;\n"
	"	int m_unused2;\n"
	"};\n"
	"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
	"{\n"
	"	return (int)contact->m_worldNormalOnB.w;\n"
	"};\n"
	"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
	"{\n"
	"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
	"};\n"
	"#endif //B3_CONTACT4DATA_H\n"
	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
	"#ifdef cl_ext_atomic_counters_32\n"
	"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
	"#else\n"
	"#define counter32_t volatile __global int*\n"
	"#endif\n"
	"#define SIMD_WIDTH 64\n"
	"typedef unsigned int u32;\n"
	"typedef unsigned short u16;\n"
	"typedef unsigned char u8;\n"
	"#define GET_GROUP_IDX get_group_id(0)\n"
	"#define GET_LOCAL_IDX get_local_id(0)\n"
	"#define GET_GLOBAL_IDX get_global_id(0)\n"
	"#define GET_GROUP_SIZE get_local_size(0)\n"
	"#define GET_NUM_GROUPS get_num_groups(0)\n"
	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
	"#define AtomInc(x) atom_inc(&(x))\n"
	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
	"#define AppendInc(x, out) out = atomic_inc(x)\n"
	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
	"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
	"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
	"#define make_float4 (float4)\n"
	"#define make_float2 (float2)\n"
	"#define make_uint4 (uint4)\n"
	"#define make_int4 (int4)\n"
	"#define make_uint2 (uint2)\n"
	"#define make_int2 (int2)\n"
	"#define max2 max\n"
	"#define min2 min\n"
	"#define WG_SIZE 64\n"
	"typedef struct \n"
	"{\n"
	"	int m_n;\n"
	"	int m_start;\n"
	"	int m_staticIdx;\n"
	"	int m_paddings[1];\n"
	"} ConstBuffer;\n"
	"typedef struct \n"
	"{\n"
	"	int m_a;\n"
	"	int m_b;\n"
	"	u32 m_idx;\n"
	"}Elem;\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	\n"
	"	const int m_n = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"		\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		for (int i=0;i<m_n;i++)\n"
	"		{\n"
	"			int srcIdx = i+m_start;\n"
	"			int batchIndex = i;\n"
	"			gConstraints[ srcIdx ].m_batchIdx = batchIndex;	\n"
	"		}\n"
	"	}\n"
	"}\n"
	"#define CHECK_SIZE (WG_SIZE)\n"
	"u32 readBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	return buff[bufIdx] & (1<<bitIdx);\n"
	"}\n"
	"void writeBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	buff[bufIdx] |= (1<<bitIdx);\n"
	"	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"}\n"
	"u32 tryWrite(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"	return ((ans >> bitIdx)&1) == 0;\n"
	"}\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	const int numConstraints = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"	b3Contact4Data_t tmp;\n"
	"	\n"
	"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		\n"
	"		\n"
	"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
	"		\n"
	"		\n"
	"		int numValidConstraints = 0;\n"
	"		int batchIdx = 0;\n"
	"		while( numValidConstraints < numConstraints)\n"
	"		{\n"
	"			int nCurrentBatch = 0;\n"
	"			//	clear flag\n"
	"	\n"
	"			for(int i=0; i<CHECK_SIZE; i++) \n"
	"				ldsFixedBuffer[i] = 0;		\n"
	"			for(int i=numValidConstraints; i<numConstraints; i++)\n"
	"			{\n"
	"				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
	"				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
	"				int bodyA = abs(bodyAS);\n"
	"				int bodyB = abs(bodyBS);\n"
	"				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
	"				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
	"				int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
	"				int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
	"				\n"
	"				if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
	"				{\n"
	"					if (!aIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyA );\n"
	"					}\n"
	"					if (!bIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyB );\n"
	"					}\n"
	"					cs[i].m_batchIdx = batchIdx;\n"
	"					if (i!=numValidConstraints)\n"
	"					{\n"
	"						tmp = cs[i];\n"
	"						cs[i] = cs[numValidConstraints];\n"
	"						cs[numValidConstraints]  = tmp;\n"
	"					}\n"
	"					numValidConstraints++;\n"
	"					\n"
	"					nCurrentBatch++;\n"
	"					if( nCurrentBatch == SIMD_WIDTH)\n"
	"					{\n"
	"						nCurrentBatch = 0;\n"
	"						for(int i=0; i<CHECK_SIZE; i++) \n"
	"							ldsFixedBuffer[i] = 0;\n"
	"						\n"
	"					}\n"
	"				}\n"
	"			}//for\n"
	"			batchIdx ++;\n"
	"		}//while\n"
	"		\n"
	"		batchSizes[wgIdx] = batchIdx;\n"
	"	}//if( lIdx == 0 )\n"
	"	\n"
	"	//return batchIdx;\n"
	"}\n"
;
// Stringified OpenCL program with two batching kernels:
// - CreateBatchesBruteForce: work-item 0 assigns a distinct m_batchIdx (= i)
//   to every contact of the workgroup (serial fallback, no body sharing check).
// - CreateBatchesNew: work-item 0 greedily builds batches, using the
//   ldsFixedBuffer bit-set so two contacts in one batch never share a
//   non-static body; accepted contacts are swapped to the front and the
//   resulting batch count is stored in batchSizes[wgIdx].
// NOTE(review): autogenerated content - regenerate via stringify, do not hand-edit.
static const char* batchingKernelsNewCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#ifndef B3_CONTACT4DATA_H\n"
	"#define B3_CONTACT4DATA_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#define B3_FLOAT4_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#define B3_PLATFORM_DEFINITIONS_H\n"
	"struct MyTest\n"
	"{\n"
	"	int bla;\n"
	"};\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
	"#define B3_LARGE_FLOAT 1e18f\n"
	"#define B3_INFINITY 1e18f\n"
	"#define b3Assert(a)\n"
	"#define b3ConstArray(a) __global const a*\n"
	"#define b3AtomicInc atomic_inc\n"
	"#define b3AtomicAdd atomic_add\n"
	"#define b3Fabs fabs\n"
	"#define b3Sqrt native_sqrt\n"
	"#define b3Sin native_sin\n"
	"#define b3Cos native_cos\n"
	"#define B3_STATIC\n"
	"#endif\n"
	"#endif\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4 b3Float4;\n"
	"	#define b3Float4ConstArg const b3Float4\n"
	"	#define b3MakeFloat4 (float4)\n"
	"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return dot(a1, b1);\n"
	"	}\n"
	"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return cross(a1, b1);\n"
	"	}\n"
	"	#define b3MinFloat4 min\n"
	"	#define b3MaxFloat4 max\n"
	"	#define b3Normalized(a) normalize(a)\n"
	"#endif \n"
	"	\n"
	"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
	"{\n"
	"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
	"		return false;\n"
	"	return true;\n"
	"}\n"
	"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
	"{\n"
	"    float maxDot = -B3_INFINITY;\n"
	"    int i = 0;\n"
	"    int ptIndex = -1;\n"
	"    for( i = 0; i < vecLen; i++ )\n"
	"    {\n"
	"        float dot = b3Dot3F4(vecArray[i],vec);\n"
	"        \n"
	"        if( dot > maxDot )\n"
	"        {\n"
	"            maxDot = dot;\n"
	"            ptIndex = i;\n"
	"        }\n"
	"    }\n"
	"	b3Assert(ptIndex>=0);\n"
	"    if (ptIndex<0)\n"
	"	{\n"
	"		ptIndex = 0;\n"
	"	}\n"
	"    *dotOut = maxDot;\n"
	"    return ptIndex;\n"
	"}\n"
	"#endif //B3_FLOAT4_H\n"
	"typedef struct b3Contact4Data b3Contact4Data_t;\n"
	"struct b3Contact4Data\n"
	"{\n"
	"	b3Float4		m_worldPosB[4];\n"
	"//	b3Float4		m_localPosA[4];\n"
	"//	b3Float4		m_localPosB[4];\n"
	"	b3Float4		m_worldNormalOnB;	//	w: m_nPoints\n"
	"	unsigned short  m_restituitionCoeffCmp;\n"
	"	unsigned short  m_frictionCoeffCmp;\n"
	"	int m_batchIdx;\n"
	"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
	"	int m_bodyBPtrAndSignBit;\n"
	"	int	m_childIndexA;\n"
	"	int	m_childIndexB;\n"
	"	int m_unused1;\n"
	"	int m_unused2;\n"
	"};\n"
	"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
	"{\n"
	"	return (int)contact->m_worldNormalOnB.w;\n"
	"};\n"
	"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
	"{\n"
	"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
	"};\n"
	"#endif //B3_CONTACT4DATA_H\n"
	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
	"#ifdef cl_ext_atomic_counters_32\n"
	"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
	"#else\n"
	"#define counter32_t volatile __global int*\n"
	"#endif\n"
	"#define SIMD_WIDTH 64\n"
	"typedef unsigned int u32;\n"
	"typedef unsigned short u16;\n"
	"typedef unsigned char u8;\n"
	"#define GET_GROUP_IDX get_group_id(0)\n"
	"#define GET_LOCAL_IDX get_local_id(0)\n"
	"#define GET_GLOBAL_IDX get_global_id(0)\n"
	"#define GET_GROUP_SIZE get_local_size(0)\n"
	"#define GET_NUM_GROUPS get_num_groups(0)\n"
	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
	"#define AtomInc(x) atom_inc(&(x))\n"
	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
	"#define AppendInc(x, out) out = atomic_inc(x)\n"
	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
	"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
	"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
	"#define make_float4 (float4)\n"
	"#define make_float2 (float2)\n"
	"#define make_uint4 (uint4)\n"
	"#define make_int4 (int4)\n"
	"#define make_uint2 (uint2)\n"
	"#define make_int2 (int2)\n"
	"#define max2 max\n"
	"#define min2 min\n"
	"#define WG_SIZE 64\n"
	"typedef struct \n"
	"{\n"
	"	int m_n;\n"
	"	int m_start;\n"
	"	int m_staticIdx;\n"
	"	int m_paddings[1];\n"
	"} ConstBuffer;\n"
	"typedef struct \n"
	"{\n"
	"	int m_a;\n"
	"	int m_b;\n"
	"	u32 m_idx;\n"
	"}Elem;\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	\n"
	"	const int m_n = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"		\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		for (int i=0;i<m_n;i++)\n"
	"		{\n"
	"			int srcIdx = i+m_start;\n"
	"			int batchIndex = i;\n"
	"			gConstraints[ srcIdx ].m_batchIdx = batchIndex;	\n"
	"		}\n"
	"	}\n"
	"}\n"
	"#define CHECK_SIZE (WG_SIZE)\n"
	"u32 readBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	return buff[bufIdx] & (1<<bitIdx);\n"
	"}\n"
	"void writeBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	buff[bufIdx] |= (1<<bitIdx);\n"
	"	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"}\n"
	"u32 tryWrite(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"	return ((ans >> bitIdx)&1) == 0;\n"
	"}\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	const int numConstraints = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"	b3Contact4Data_t tmp;\n"
	"	\n"
	"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		\n"
	"		\n"
	"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
	"		\n"
	"		\n"
	"		int numValidConstraints = 0;\n"
	"		int batchIdx = 0;\n"
	"		while( numValidConstraints < numConstraints)\n"
	"		{\n"
	"			int nCurrentBatch = 0;\n"
	"			//	clear flag\n"
	"	\n"
	"			for(int i=0; i<CHECK_SIZE; i++) \n"
	"				ldsFixedBuffer[i] = 0;		\n"
	"			for(int i=numValidConstraints; i<numConstraints; i++)\n"
	"			{\n"
	"				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
	"				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
	"				int bodyA = abs(bodyAS);\n"
	"				int bodyB = abs(bodyBS);\n"
	"				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
	"				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
	"				int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
	"				int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
	"				\n"
	"				if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
	"				{\n"
	"					if (!aIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyA );\n"
	"					}\n"
	"					if (!bIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyB );\n"
	"					}\n"
	"					cs[i].m_batchIdx = batchIdx;\n"
	"					if (i!=numValidConstraints)\n"
	"					{\n"
	"						tmp = cs[i];\n"
	"						cs[i] = cs[numValidConstraints];\n"
	"						cs[numValidConstraints]  = tmp;\n"
	"					}\n"
	"					numValidConstraints++;\n"
	"					\n"
	"					nCurrentBatch++;\n"
	"					if( nCurrentBatch == SIMD_WIDTH)\n"
	"					{\n"
	"						nCurrentBatch = 0;\n"
	"						for(int i=0; i<CHECK_SIZE; i++) \n"
	"							ldsFixedBuffer[i] = 0;\n"
	"						\n"
	"					}\n"
	"				}\n"
	"			}//for\n"
	"			batchIdx ++;\n"
	"		}//while\n"
	"		\n"
	"		batchSizes[wgIdx] = batchIdx;\n"
	"	}//if( lIdx == 0 )\n"
	"	\n"
	"	//return batchIdx;\n"
	"}\n";

View File

@@ -1,433 +1,432 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* integrateKernelCL= \
"/*\n"
"Copyright (c) 2013 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#define B3_RIGIDBODY_DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#define B3_QUAT_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Quat;\n"
" #define b3QuatConstArg const b3Quat\n"
" \n"
" \n"
"inline float4 b3FastNormalize4(float4 v)\n"
"{\n"
" v = (float4)(v.xyz,0.f);\n"
" return fast_normalize(v);\n"
"}\n"
" \n"
"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
"{\n"
" b3Quat ans;\n"
" ans = b3Cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
"{\n"
" b3Quat q;\n"
" q=in;\n"
" //return b3FastNormalize4(in);\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" b3Quat qInv = b3QuatInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" return b3QuatRotate( b3QuatInvert( q ), vec );\n"
"}\n"
"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n"
"{\n"
" return b3QuatRotate( orientation, point ) + (translation);\n"
"}\n"
" \n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifndef B3_MAT3x3_H\n"
"#define B3_MAT3x3_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"typedef struct\n"
"{\n"
" b3Float4 m_row[3];\n"
"}b3Mat3x3;\n"
"#define b3Mat3x3ConstArg const b3Mat3x3\n"
"#define b3GetRow(m,row) (m.m_row[row])\n"
"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
"{\n"
" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" b3Mat3x3 out;\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
" return out;\n"
"}\n"
"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = fabs(matIn.m_row[0]);\n"
" out.m_row[1] = fabs(matIn.m_row[1]);\n"
" out.m_row[2] = fabs(matIn.m_row[2]);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtZero();\n"
"__inline\n"
"b3Mat3x3 mtIdentity();\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Mat3x3 mtZero()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(0.f);\n"
" m.m_row[1] = (b3Float4)(0.f);\n"
" m.m_row[2] = (b3Float4)(0.f);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtIdentity()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(1,0,0,0);\n"
" m.m_row[1] = (b3Float4)(0,1,0,0);\n"
" m.m_row[2] = (b3Float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
"{\n"
" b3Mat3x3 transB;\n"
" transB = mtTranspose( b );\n"
" b3Mat3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
"{\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a.m_row[0], b );\n"
" ans.y = b3Dot3F4( a.m_row[1], b );\n"
" ans.z = b3Dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
"{\n"
" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a, colx );\n"
" ans.y = b3Dot3F4( a, coly );\n"
" ans.z = b3Dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
"struct b3RigidBodyData\n"
"{\n"
" b3Float4 m_pos;\n"
" b3Quat m_quat;\n"
" b3Float4 m_linVel;\n"
" b3Float4 m_angVel;\n"
" int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"};\n"
"typedef struct b3InertiaData b3InertiaData_t;\n"
"struct b3InertiaData\n"
"{\n"
" b3Mat3x3 m_invInertiaWorld;\n"
" b3Mat3x3 m_initInvInertia;\n"
"};\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" \n"
" if (bodies[nodeID].m_invMass != 0.f)\n"
" {\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" bodies[nodeID].m_angVel.x *= angularDamping;\n"
" bodies[nodeID].m_angVel.y *= angularDamping;\n"
" bodies[nodeID].m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = bodies[nodeID].m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" \n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" \n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = bodies[nodeID].m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" bodies[nodeID].m_quat=predictedOrn;\n"
" }\n"
" //linear velocity \n"
" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n"
" \n"
" //apply gravity\n"
" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" \n"
" if( (body->m_invMass != 0.f))\n"
" {\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" body->m_angVel.x *= angularDamping;\n"
" body->m_angVel.y *= angularDamping;\n"
" body->m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = body->m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = body->m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" body->m_quat=predictedOrn;\n"
" }\n"
" //apply gravity\n"
" body->m_linVel += gravityAcceleration * timeStep;\n"
" //linear velocity \n"
" body->m_pos += body->m_linVel * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"__kernel void \n"
" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" \n"
" if( nodeID < numNodes)\n"
" {\n"
" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n"
" }\n"
"}\n"
;
static const char* integrateKernelCL =
"/*\n"
"Copyright (c) 2013 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#define B3_RIGIDBODY_DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#define B3_QUAT_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Quat;\n"
" #define b3QuatConstArg const b3Quat\n"
" \n"
" \n"
"inline float4 b3FastNormalize4(float4 v)\n"
"{\n"
" v = (float4)(v.xyz,0.f);\n"
" return fast_normalize(v);\n"
"}\n"
" \n"
"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
"{\n"
" b3Quat ans;\n"
" ans = b3Cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
"{\n"
" b3Quat q;\n"
" q=in;\n"
" //return b3FastNormalize4(in);\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" b3Quat qInv = b3QuatInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" return b3QuatRotate( b3QuatInvert( q ), vec );\n"
"}\n"
"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n"
"{\n"
" return b3QuatRotate( orientation, point ) + (translation);\n"
"}\n"
" \n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifndef B3_MAT3x3_H\n"
"#define B3_MAT3x3_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"typedef struct\n"
"{\n"
" b3Float4 m_row[3];\n"
"}b3Mat3x3;\n"
"#define b3Mat3x3ConstArg const b3Mat3x3\n"
"#define b3GetRow(m,row) (m.m_row[row])\n"
"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
"{\n"
" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" b3Mat3x3 out;\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
" return out;\n"
"}\n"
"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = fabs(matIn.m_row[0]);\n"
" out.m_row[1] = fabs(matIn.m_row[1]);\n"
" out.m_row[2] = fabs(matIn.m_row[2]);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtZero();\n"
"__inline\n"
"b3Mat3x3 mtIdentity();\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Mat3x3 mtZero()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(0.f);\n"
" m.m_row[1] = (b3Float4)(0.f);\n"
" m.m_row[2] = (b3Float4)(0.f);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtIdentity()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(1,0,0,0);\n"
" m.m_row[1] = (b3Float4)(0,1,0,0);\n"
" m.m_row[2] = (b3Float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
"{\n"
" b3Mat3x3 transB;\n"
" transB = mtTranspose( b );\n"
" b3Mat3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
"{\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a.m_row[0], b );\n"
" ans.y = b3Dot3F4( a.m_row[1], b );\n"
" ans.z = b3Dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
"{\n"
" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a, colx );\n"
" ans.y = b3Dot3F4( a, coly );\n"
" ans.z = b3Dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
"struct b3RigidBodyData\n"
"{\n"
" b3Float4 m_pos;\n"
" b3Quat m_quat;\n"
" b3Float4 m_linVel;\n"
" b3Float4 m_angVel;\n"
" int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"};\n"
"typedef struct b3InertiaData b3InertiaData_t;\n"
"struct b3InertiaData\n"
"{\n"
" b3Mat3x3 m_invInertiaWorld;\n"
" b3Mat3x3 m_initInvInertia;\n"
"};\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" \n"
" if (bodies[nodeID].m_invMass != 0.f)\n"
" {\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" bodies[nodeID].m_angVel.x *= angularDamping;\n"
" bodies[nodeID].m_angVel.y *= angularDamping;\n"
" bodies[nodeID].m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = bodies[nodeID].m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" \n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" \n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = bodies[nodeID].m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" bodies[nodeID].m_quat=predictedOrn;\n"
" }\n"
" //linear velocity \n"
" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n"
" \n"
" //apply gravity\n"
" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" \n"
" if( (body->m_invMass != 0.f))\n"
" {\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" body->m_angVel.x *= angularDamping;\n"
" body->m_angVel.y *= angularDamping;\n"
" body->m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = body->m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = body->m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" body->m_quat=predictedOrn;\n"
" }\n"
" //apply gravity\n"
" body->m_linVel += gravityAcceleration * timeStep;\n"
" //linear velocity \n"
" body->m_pos += body->m_linVel * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"__kernel void \n"
" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" \n"
" if( nodeID < numNodes)\n"
" {\n"
" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n"
" }\n"
"}\n";

File diff suppressed because it is too large Load Diff

View File

@@ -1,393 +1,392 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solveContactCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"#define WG_SIZE 64\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
"{\n"
" float minRambdaDt = 0;\n"
" float maxRambdaDt = FLT_MAX;\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = cs->m_worldPos[ic] - posA;\n"
" float4 r1 = cs->m_worldPos[ic] - posB;\n"
" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
" rambdaDt *= cs->m_jacCoeffInv[ic];\n"
" {\n"
" float prevSum = cs->m_appliedRambdaDt[ic];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt );\n"
" updated = min2( updated, maxRambdaDt );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_appliedRambdaDt[ic] = updated;\n"
" }\n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" *linVelA += linImp0;\n"
" *angVelA += angImp0;\n"
" *linVelB += linImp1;\n"
" *angVelB += angImp1;\n"
" }\n"
"}\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" //float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" \n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" \n"
" }\n"
"}\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelContact(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" \n"
" \n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
" //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
" //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
" //int cellIdx = xIdx+yIdx*nSplit;\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" \n"
" \n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"
" \n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"__kernel void solveSingleContactKernel(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" int cellIdx,\n"
" int batchOffset,\n"
" int numConstraintsInBatch\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if (index < numConstraintsInBatch)\n"
" {\n"
" int idx=batchOffset+index;\n"
" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" } \n"
"}\n"
;
// NOTE(review): this is the clang-formatted ("after") version of the same
// autogenerated `solveContactCL` string; the pre-format ("before") version
// appears above in this capture. A real translation unit cannot define the
// variable twice -- this duplication is a diff-rendering artifact; verify
// against the real file.
// The string content itself is runtime data (the embedded OpenCL program
// handed to the OpenCL compiler), regenerated by stringify.bat; do not
// hand-edit or reformat the bytes inside the quotes.
static const char* solveContactCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Takahiro Harada\n"
	"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
	"#ifdef cl_ext_atomic_counters_32\n"
	"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
	"#else\n"
	"#define counter32_t volatile global int*\n"
	"#endif\n"
	"typedef unsigned int u32;\n"
	"typedef unsigned short u16;\n"
	"typedef unsigned char u8;\n"
	"#define GET_GROUP_IDX get_group_id(0)\n"
	"#define GET_LOCAL_IDX get_local_id(0)\n"
	"#define GET_GLOBAL_IDX get_global_id(0)\n"
	"#define GET_GROUP_SIZE get_local_size(0)\n"
	"#define GET_NUM_GROUPS get_num_groups(0)\n"
	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
	"#define AtomInc(x) atom_inc(&(x))\n"
	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
	"#define AppendInc(x, out) out = atomic_inc(x)\n"
	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
	"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
	"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
	"#define mymake_float4 (float4)\n"
	"//#define make_float2 (float2)\n"
	"//#define make_uint4 (uint4)\n"
	"//#define make_int4 (int4)\n"
	"//#define make_uint2 (uint2)\n"
	"//#define make_int2 (int2)\n"
	"#define max2 max\n"
	"#define min2 min\n"
	"///////////////////////////////////////\n"
	"//	Vector\n"
	"///////////////////////////////////////\n"
	"__inline\n"
	"float4 fastNormalize4(float4 v)\n"
	"{\n"
	"	return fast_normalize(v);\n"
	"}\n"
	"__inline\n"
	"float4 cross3(float4 a, float4 b)\n"
	"{\n"
	"	return cross(a,b);\n"
	"}\n"
	"__inline\n"
	"float dot3F4(float4 a, float4 b)\n"
	"{\n"
	"	float4 a1 = mymake_float4(a.xyz,0.f);\n"
	"	float4 b1 = mymake_float4(b.xyz,0.f);\n"
	"	return dot(a1, b1);\n"
	"}\n"
	"__inline\n"
	"float4 normalize3(const float4 a)\n"
	"{\n"
	"	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
	"	return fastNormalize4( n );\n"
	"//	float length = sqrtf(dot3F4(a, a));\n"
	"//	return 1.f/length * a;\n"
	"}\n"
	"///////////////////////////////////////\n"
	"//	Matrix3x3\n"
	"///////////////////////////////////////\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_row[3];\n"
	"}Matrix3x3;\n"
	"__inline\n"
	"float4 mtMul1(Matrix3x3 a, float4 b);\n"
	"__inline\n"
	"float4 mtMul3(float4 a, Matrix3x3 b);\n"
	"__inline\n"
	"float4 mtMul1(Matrix3x3 a, float4 b)\n"
	"{\n"
	"	float4 ans;\n"
	"	ans.x = dot3F4( a.m_row[0], b );\n"
	"	ans.y = dot3F4( a.m_row[1], b );\n"
	"	ans.z = dot3F4( a.m_row[2], b );\n"
	"	ans.w = 0.f;\n"
	"	return ans;\n"
	"}\n"
	"__inline\n"
	"float4 mtMul3(float4 a, Matrix3x3 b)\n"
	"{\n"
	"	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
	"	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
	"	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
	"	float4 ans;\n"
	"	ans.x = dot3F4( a, colx );\n"
	"	ans.y = dot3F4( a, coly );\n"
	"	ans.z = dot3F4( a, colz );\n"
	"	return ans;\n"
	"}\n"
	"///////////////////////////////////////\n"
	"//	Quaternion\n"
	"///////////////////////////////////////\n"
	"typedef float4 Quaternion;\n"
	"#define WG_SIZE 64\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_pos;\n"
	"	Quaternion m_quat;\n"
	"	float4 m_linVel;\n"
	"	float4 m_angVel;\n"
	"	u32 m_shapeIdx;\n"
	"	float m_invMass;\n"
	"	float m_restituitionCoeff;\n"
	"	float m_frictionCoeff;\n"
	"} Body;\n"
	"typedef struct\n"
	"{\n"
	"	Matrix3x3 m_invInertia;\n"
	"	Matrix3x3 m_initInvInertia;\n"
	"} Shape;\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_linear;\n"
	"	float4 m_worldPos[4];\n"
	"	float4 m_center;	\n"
	"	float m_jacCoeffInv[4];\n"
	"	float m_b[4];\n"
	"	float m_appliedRambdaDt[4];\n"
	"	float m_fJacCoeffInv[2];	\n"
	"	float m_fAppliedRambdaDt[2];	\n"
	"	u32 m_bodyA;\n"
	"	u32 m_bodyB;\n"
	"	int m_batchIdx;\n"
	"	u32 m_paddings[1];\n"
	"} Constraint4;\n"
	"typedef struct\n"
	"{\n"
	"	int m_nConstraints;\n"
	"	int m_start;\n"
	"	int m_batchIdx;\n"
	"	int m_nSplit;\n"
	"//	int m_paddings[1];\n"
	"} ConstBuffer;\n"
	"typedef struct\n"
	"{\n"
	"	int m_solveFriction;\n"
	"	int m_maxBatch;	//	long batch really kills the performance\n"
	"	int m_batchIdx;\n"
	"	int m_nSplit;\n"
	"//	int m_paddings[1];\n"
	"} ConstBufferBatchSolve;\n"
	"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
	"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
	"{\n"
	"	*linear = mymake_float4(-n.xyz,0.f);\n"
	"	*angular0 = -cross3(r0, n);\n"
	"	*angular1 = cross3(r1, n);\n"
	"}\n"
	"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
	"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
	"{\n"
	"	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
	"}\n"
	"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
	"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
	"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
	"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
	"{\n"
	"	//	linear0,1 are normlized\n"
	"	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
	"	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
	"	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
	"	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
	"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
	"}\n"
	"void solveContact(__global Constraint4* cs,\n"
	"			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
	"			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
	"void solveContact(__global Constraint4* cs,\n"
	"			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
	"			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
	"{\n"
	"	float minRambdaDt = 0;\n"
	"	float maxRambdaDt = FLT_MAX;\n"
	"	for(int ic=0; ic<4; ic++)\n"
	"	{\n"
	"		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
	"		float4 angular0, angular1, linear;\n"
	"		float4 r0 = cs->m_worldPos[ic] - posA;\n"
	"		float4 r1 = cs->m_worldPos[ic] - posB;\n"
	"		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
	"		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
	"			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
	"		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
	"		{\n"
	"			float prevSum = cs->m_appliedRambdaDt[ic];\n"
	"			float updated = prevSum;\n"
	"			updated += rambdaDt;\n"
	"			updated = max2( updated, minRambdaDt );\n"
	"			updated = min2( updated, maxRambdaDt );\n"
	"			rambdaDt = updated - prevSum;\n"
	"			cs->m_appliedRambdaDt[ic] = updated;\n"
	"		}\n"
	"		float4 linImp0 = invMassA*linear*rambdaDt;\n"
	"		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
	"		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
	"		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
	"		*linVelA += linImp0;\n"
	"		*angVelA += angImp0;\n"
	"		*linVelB += linImp1;\n"
	"		*angVelB += angImp1;\n"
	"	}\n"
	"}\n"
	"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
	" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
	"{\n"
	"  if (fabs(n[0].z) > 0.70710678f) {\n"
	"    // choose p in y-z plane\n"
	"    float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
	"    float k = 1.f/sqrt(a);\n"
	"    p[0].x = 0;\n"
	"	p[0].y = -n[0].z*k;\n"
	"	p[0].z = n[0].y*k;\n"
	"    // set q = n x p\n"
	"    q[0].x = a*k;\n"
	"	q[0].y = -n[0].x*p[0].z;\n"
	"	q[0].z = n[0].x*p[0].y;\n"
	"  }\n"
	"  else {\n"
	"    // choose p in x-y plane\n"
	"    float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
	"    float k = 1.f/sqrt(a);\n"
	"    p[0].x = -n[0].y*k;\n"
	"	p[0].y = n[0].x*k;\n"
	"	p[0].z = 0;\n"
	"    // set q = n x p\n"
	"    q[0].x = -n[0].z*p[0].y;\n"
	"	q[0].y = n[0].z*p[0].x;\n"
	"	q[0].z = a*k;\n"
	"  }\n"
	"}\n"
	"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
	"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
	"{\n"
	"	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
	"	int aIdx = ldsCs[0].m_bodyA;\n"
	"	int bIdx = ldsCs[0].m_bodyB;\n"
	"	float4 posA = gBodies[aIdx].m_pos;\n"
	"	float4 linVelA = gBodies[aIdx].m_linVel;\n"
	"	float4 angVelA = gBodies[aIdx].m_angVel;\n"
	"	float invMassA = gBodies[aIdx].m_invMass;\n"
	"	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
	"	float4 posB = gBodies[bIdx].m_pos;\n"
	"	float4 linVelB = gBodies[bIdx].m_linVel;\n"
	"	float4 angVelB = gBodies[bIdx].m_angVel;\n"
	"	float invMassB = gBodies[bIdx].m_invMass;\n"
	"	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
	"	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
	"			posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
	"	if (gBodies[aIdx].m_invMass)\n"
	"	{\n"
	"		gBodies[aIdx].m_linVel = linVelA;\n"
	"		gBodies[aIdx].m_angVel = angVelA;\n"
	"	} else\n"
	"	{\n"
	"		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
	"		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
	"		\n"
	"	}\n"
	"	if (gBodies[bIdx].m_invMass)\n"
	"	{\n"
	"		gBodies[bIdx].m_linVel = linVelB;\n"
	"		gBodies[bIdx].m_angVel = angVelB;\n"
	"	} else\n"
	"	{\n"
	"		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
	"		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
	"		\n"
	"	}\n"
	"}\n"
	"typedef struct \n"
	"{\n"
	"	int m_valInt0;\n"
	"	int m_valInt1;\n"
	"	int m_valInt2;\n"
	"	int m_valInt3;\n"
	"	float m_val0;\n"
	"	float m_val1;\n"
	"	float m_val2;\n"
	"	float m_val3;\n"
	"} SolverDebugInfo;\n"
	"__kernel\n"
	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
	"void BatchSolveKernelContact(__global Body* gBodies,\n"
	"                      __global Shape* gShapes,\n"
	"                      __global Constraint4* gConstraints,\n"
	"                      __global int* gN,\n"
	"                      __global int* gOffsets,\n"
	"                      __global int* batchSizes,\n"
	"                      int maxBatch1,\n"
	"                      int cellBatch,\n"
	"                      int4 nSplit\n"
	"                      )\n"
	"{\n"
	"	//__local int ldsBatchIdx[WG_SIZE+1];\n"
	"	__local int ldsCurBatch;\n"
	"	__local int ldsNextBatch;\n"
	"	__local int ldsStart;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"//	int gIdx = GET_GLOBAL_IDX;\n"
	"//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
	"	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
	"	\n"
	"	\n"
	"	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
	"	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
	"	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
	"	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
	"	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
	"	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
	"	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
	"	//int cellIdx = xIdx+yIdx*nSplit;\n"
	"	\n"
	"	if( gN[cellIdx] == 0 ) \n"
	"		return;\n"
	"	int maxBatch = batchSizes[cellIdx];\n"
	"	\n"
	"	\n"
	"	const int start = gOffsets[cellIdx];\n"
	"	const int end = start + gN[cellIdx];\n"
	"	\n"
	"	\n"
	"	\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		ldsCurBatch = 0;\n"
	"		ldsNextBatch = 0;\n"
	"		ldsStart = start;\n"
	"	}\n"
	"	GROUP_LDS_BARRIER;\n"
	"	int idx=ldsStart+lIdx;\n"
	"	while (ldsCurBatch < maxBatch)\n"
	"	{\n"
	"		for(; idx<end; )\n"
	"		{\n"
	"			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
	"			{\n"
	"				solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
	"				 idx+=64;\n"
	"			} else\n"
	"			{\n"
	"				break;\n"
	"			}\n"
	"		}\n"
	"		GROUP_LDS_BARRIER;\n"
	"		\n"
	"		if( lIdx == 0 )\n"
	"		{\n"
	"			ldsCurBatch++;\n"
	"		}\n"
	"		GROUP_LDS_BARRIER;\n"
	"	}\n"
	"	\n"
	"	\n"
	"}\n"
	"__kernel void solveSingleContactKernel(__global Body* gBodies,\n"
	"                      __global Shape* gShapes,\n"
	"                      __global Constraint4* gConstraints,\n"
	"                      int cellIdx,\n"
	"                      int batchOffset,\n"
	"                      int numConstraintsInBatch\n"
	"                      )\n"
	"{\n"
	"	int index = get_global_id(0);\n"
	"	if (index < numConstraintsInBatch)\n"
	"	{\n"
	"		int idx=batchOffset+index;\n"
	"		solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
	"	}	\n"
	"}\n";

View File

@@ -1,421 +1,420 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solveFrictionCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"#define WG_SIZE 64\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" \n"
" {\n"
" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
" float sum = 0;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
" }\n"
" frictionCoeff = 0.7f;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" maxRambdaDt[j] = frictionCoeff*sum;\n"
" minRambdaDt[j] = -maxRambdaDt[j];\n"
" }\n"
" \n"
"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
" \n"
" \n"
" {\n"
" \n"
" __global Constraint4* cs = ldsCs;\n"
" \n"
" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n"
" const float4 center = cs->m_center;\n"
" \n"
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB );\n"
" rambdaDt *= cs->m_fJacCoeffInv[i];\n"
" \n"
" {\n"
" float prevSum = cs->m_fAppliedRambdaDt[i];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt[i] );\n"
" updated = min2( updated, maxRambdaDt[i] );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_fAppliedRambdaDt[i] = updated;\n"
" }\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" \n"
" linVelA += linImp0;\n"
" angVelA += angImp0;\n"
" linVelB += linImp1;\n"
" angVelB += angImp1;\n"
" }\n"
" { // angular damping for point constraint\n"
" float4 ab = normalize3( posB - posA );\n"
" float4 ac = normalize3( center - posA );\n"
" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n"
" {\n"
" float angNA = dot3F4( n, angVelA );\n"
" float angNB = dot3F4( n, angVelB );\n"
" \n"
" angVelA -= (angNA*0.1f)*n;\n"
" angVelB -= (angNB*0.1f)*n;\n"
" }\n"
" }\n"
" }\n"
" \n"
" \n"
" }\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" \n"
"}\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelFriction(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" int cellIdx,\n"
" int batchOffset,\n"
" int numConstraintsInBatch\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if (index < numConstraintsInBatch)\n"
" {\n"
" \n"
" int idx=batchOffset+index;\n"
" \n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" } \n"
"}\n"
;
static const char* solveFrictionCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"#define WG_SIZE 64\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" \n"
" {\n"
" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
" float sum = 0;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
" }\n"
" frictionCoeff = 0.7f;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" maxRambdaDt[j] = frictionCoeff*sum;\n"
" minRambdaDt[j] = -maxRambdaDt[j];\n"
" }\n"
" \n"
"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
" \n"
" \n"
" {\n"
" \n"
" __global Constraint4* cs = ldsCs;\n"
" \n"
" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n"
" const float4 center = cs->m_center;\n"
" \n"
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB );\n"
" rambdaDt *= cs->m_fJacCoeffInv[i];\n"
" \n"
" {\n"
" float prevSum = cs->m_fAppliedRambdaDt[i];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt[i] );\n"
" updated = min2( updated, maxRambdaDt[i] );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_fAppliedRambdaDt[i] = updated;\n"
" }\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" \n"
" linVelA += linImp0;\n"
" angVelA += angImp0;\n"
" linVelB += linImp1;\n"
" angVelB += angImp1;\n"
" }\n"
" { // angular damping for point constraint\n"
" float4 ab = normalize3( posB - posA );\n"
" float4 ac = normalize3( center - posA );\n"
" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n"
" {\n"
" float angNA = dot3F4( n, angVelA );\n"
" float angNB = dot3F4( n, angVelB );\n"
" \n"
" angVelA -= (angNA*0.1f)*n;\n"
" angVelB -= (angNB*0.1f)*n;\n"
" }\n"
" }\n"
" }\n"
" \n"
" \n"
" }\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" \n"
"}\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelFriction(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" int cellIdx,\n"
" int batchOffset,\n"
" int numConstraintsInBatch\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if (index < numConstraintsInBatch)\n"
" {\n"
" \n"
" int idx=batchOffset+index;\n"
" \n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" } \n"
"}\n";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,483 +1,482 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* updateAabbsKernelCL= \
"#ifndef B3_UPDATE_AABBS_H\n"
"#define B3_UPDATE_AABBS_H\n"
"#ifndef B3_AABB_H\n"
"#define B3_AABB_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_MAT3x3_H\n"
"#define B3_MAT3x3_H\n"
"#ifndef B3_QUAT_H\n"
"#define B3_QUAT_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Quat;\n"
" #define b3QuatConstArg const b3Quat\n"
" \n"
" \n"
"inline float4 b3FastNormalize4(float4 v)\n"
"{\n"
" v = (float4)(v.xyz,0.f);\n"
" return fast_normalize(v);\n"
"}\n"
" \n"
"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
"{\n"
" b3Quat ans;\n"
" ans = b3Cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
"{\n"
" b3Quat q;\n"
" q=in;\n"
" //return b3FastNormalize4(in);\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" b3Quat qInv = b3QuatInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" return b3QuatRotate( b3QuatInvert( q ), vec );\n"
"}\n"
"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n"
"{\n"
" return b3QuatRotate( orientation, point ) + (translation);\n"
"}\n"
" \n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"typedef struct\n"
"{\n"
" b3Float4 m_row[3];\n"
"}b3Mat3x3;\n"
"#define b3Mat3x3ConstArg const b3Mat3x3\n"
"#define b3GetRow(m,row) (m.m_row[row])\n"
"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
"{\n"
" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" b3Mat3x3 out;\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
" return out;\n"
"}\n"
"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = fabs(matIn.m_row[0]);\n"
" out.m_row[1] = fabs(matIn.m_row[1]);\n"
" out.m_row[2] = fabs(matIn.m_row[2]);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtZero();\n"
"__inline\n"
"b3Mat3x3 mtIdentity();\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Mat3x3 mtZero()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(0.f);\n"
" m.m_row[1] = (b3Float4)(0.f);\n"
" m.m_row[2] = (b3Float4)(0.f);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtIdentity()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(1,0,0,0);\n"
" m.m_row[1] = (b3Float4)(0,1,0,0);\n"
" m.m_row[2] = (b3Float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
"{\n"
" b3Mat3x3 transB;\n"
" transB = mtTranspose( b );\n"
" b3Mat3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
"{\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a.m_row[0], b );\n"
" ans.y = b3Dot3F4( a.m_row[1], b );\n"
" ans.z = b3Dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
"{\n"
" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a, colx );\n"
" ans.y = b3Dot3F4( a, coly );\n"
" ans.z = b3Dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3Aabb b3Aabb_t;\n"
"struct b3Aabb\n"
"{\n"
" union\n"
" {\n"
" float m_min[4];\n"
" b3Float4 m_minVec;\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float m_max[4];\n"
" b3Float4 m_maxVec;\n"
" int m_signedMaxIndices[4];\n"
" };\n"
"};\n"
"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
" b3Float4ConstArg pos,\n"
" b3QuatConstArg orn,\n"
" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
"{\n"
" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
" b3Mat3x3 m;\n"
" m = b3QuatGetRotationMatrix(orn);\n"
" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
" \n"
" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
" 0.f);\n"
" *aabbMinOut = center-extent;\n"
" *aabbMaxOut = center+extent;\n"
"}\n"
"/// conservative test for overlap between two aabbs\n"
"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"#endif //B3_AABB_H\n"
"#ifndef B3_COLLIDABLE_H\n"
"#define B3_COLLIDABLE_H\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"enum b3ShapeTypes\n"
"{\n"
" SHAPE_HEIGHT_FIELD=1,\n"
" SHAPE_CONVEX_HULL=3,\n"
" SHAPE_PLANE=4,\n"
" SHAPE_CONCAVE_TRIMESH=5,\n"
" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n"
" SHAPE_SPHERE=7,\n"
" MAX_NUM_SHAPE_TYPES,\n"
"};\n"
"typedef struct b3Collidable b3Collidable_t;\n"
"struct b3Collidable\n"
"{\n"
" union {\n"
" int m_numChildShapes;\n"
" int m_bvhIndex;\n"
" };\n"
" union\n"
" {\n"
" float m_radius;\n"
" int m_compoundBvhIndex;\n"
" };\n"
" int m_shapeType;\n"
" union\n"
" {\n"
" int m_shapeIndex;\n"
" float m_height;\n"
" };\n"
"};\n"
"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
"struct b3GpuChildShape\n"
"{\n"
" b3Float4 m_childPosition;\n"
" b3Quat m_childOrientation;\n"
" union\n"
" {\n"
" int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" int m_capsuleAxis;\n"
" };\n"
" union \n"
" {\n"
" float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n"
" int m_numChildShapes;//used for compound shape\n"
" };\n"
" union \n"
" {\n"
" float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n"
" int m_collidableShapeIndex;\n"
" };\n"
" int m_shapeType;\n"
"};\n"
"struct b3CompoundOverlappingPair\n"
"{\n"
" int m_bodyIndexA;\n"
" int m_bodyIndexB;\n"
"// int m_pairType;\n"
" int m_childShapeIndexA;\n"
" int m_childShapeIndexB;\n"
"};\n"
"#endif //B3_COLLIDABLE_H\n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#define B3_RIGIDBODY_DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifndef B3_MAT3x3_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
"struct b3RigidBodyData\n"
"{\n"
" b3Float4 m_pos;\n"
" b3Quat m_quat;\n"
" b3Float4 m_linVel;\n"
" b3Float4 m_angVel;\n"
" int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"};\n"
"typedef struct b3InertiaData b3InertiaData_t;\n"
"struct b3InertiaData\n"
"{\n"
" b3Mat3x3 m_invInertiaWorld;\n"
" b3Mat3x3 m_initInvInertia;\n"
"};\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"void b3ComputeWorldAabb( int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n"
"{\n"
" __global const b3RigidBodyData_t* body = &bodies[bodyId];\n"
" b3Float4 position = body->m_pos;\n"
" b3Quat orientation = body->m_quat;\n"
" \n"
" int collidableIndex = body->m_collidableIdx;\n"
" int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n"
" \n"
" if (shapeIndex>=0)\n"
" {\n"
" \n"
" b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n"
" b3Aabb_t worldAabb;\n"
" \n"
" b3Float4 aabbAMinOut,aabbAMaxOut; \n"
" float margin = 0.f;\n"
" b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n"
" \n"
" worldAabb.m_minVec =aabbAMinOut;\n"
" worldAabb.m_minIndices[3] = bodyId;\n"
" worldAabb.m_maxVec = aabbAMaxOut;\n"
" worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n"
" worldAabbs[bodyId] = worldAabb;\n"
" }\n"
"}\n"
"#endif //B3_UPDATE_AABBS_H\n"
"__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" if( nodeID < numNodes )\n"
" {\n"
" b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n"
" }\n"
"}\n"
"__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs)\n"
"{\n"
" int pairId = get_global_id(0);\n"
" if( pairId< numPairs )\n"
" {\n"
" pairs[pairId].z = 0xffffffff;\n"
" }\n"
"}\n"
;
// Embedded OpenCL program source for updating rigid-body world-space AABBs.
// NOTE(review): this appears to be machine-stringified kernel code (presumably
// generated from a .cl file by a stringify tool -- confirm before editing).
// Do NOT hand-edit or reformat the string contents; regenerate from the
// original kernel source instead.
//
// The program defines two kernels:
//   - initializeGpuAabbsFull: for each body, recomputes the world-space AABB
//     from its local shape AABB, position and orientation via
//     b3ComputeWorldAabb (which transforms the local AABB with
//     b3TransformAabb2 and stores bodyId / static-vs-dynamic flags in the
//     min/max index slots).
//   - clearOverlappingPairsKernel: resets pairs[i].z to 0xffffffff for each
//     pair, marking it as cleared.
static const char* updateAabbsKernelCL =
	"#ifndef B3_UPDATE_AABBS_H\n"
	"#define B3_UPDATE_AABBS_H\n"
	"#ifndef B3_AABB_H\n"
	"#define B3_AABB_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#define B3_FLOAT4_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#define B3_PLATFORM_DEFINITIONS_H\n"
	"struct MyTest\n"
	"{\n"
	"	int bla;\n"
	"};\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
	"#define B3_LARGE_FLOAT 1e18f\n"
	"#define B3_INFINITY 1e18f\n"
	"#define b3Assert(a)\n"
	"#define b3ConstArray(a) __global const a*\n"
	"#define b3AtomicInc atomic_inc\n"
	"#define b3AtomicAdd atomic_add\n"
	"#define b3Fabs fabs\n"
	"#define b3Sqrt native_sqrt\n"
	"#define b3Sin native_sin\n"
	"#define b3Cos native_cos\n"
	"#define B3_STATIC\n"
	"#endif\n"
	"#endif\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4	b3Float4;\n"
	"	#define b3Float4ConstArg const b3Float4\n"
	"	#define b3MakeFloat4 (float4)\n"
	"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return dot(a1, b1);\n"
	"	}\n"
	"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return cross(a1, b1);\n"
	"	}\n"
	"	#define b3MinFloat4 min\n"
	"	#define b3MaxFloat4 max\n"
	"	#define b3Normalized(a) normalize(a)\n"
	"#endif \n"
	"		\n"
	"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
	"{\n"
	"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
	"		return false;\n"
	"	return true;\n"
	"}\n"
	"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
	"{\n"
	"    float maxDot = -B3_INFINITY;\n"
	"    int i = 0;\n"
	"    int ptIndex = -1;\n"
	"    for( i = 0; i < vecLen; i++ )\n"
	"    {\n"
	"        float dot = b3Dot3F4(vecArray[i],vec);\n"
	"            \n"
	"        if( dot > maxDot )\n"
	"        {\n"
	"            maxDot = dot;\n"
	"            ptIndex = i;\n"
	"        }\n"
	"    }\n"
	"	b3Assert(ptIndex>=0);\n"
	"    if (ptIndex<0)\n"
	"	{\n"
	"		ptIndex = 0;\n"
	"	}\n"
	"    *dotOut = maxDot;\n"
	"    return ptIndex;\n"
	"}\n"
	"#endif //B3_FLOAT4_H\n"
	"#ifndef B3_MAT3x3_H\n"
	"#define B3_MAT3x3_H\n"
	"#ifndef B3_QUAT_H\n"
	"#define B3_QUAT_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif\n"
	"#endif\n"
	"#ifndef B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4	b3Quat;\n"
	"	#define b3QuatConstArg const b3Quat\n"
	"	\n"
	"	\n"
	"inline float4 b3FastNormalize4(float4 v)\n"
	"{\n"
	"	v = (float4)(v.xyz,0.f);\n"
	"	return fast_normalize(v);\n"
	"}\n"
	"	\n"
	"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
	"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
	"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
	"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
	"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
	"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
	"{\n"
	"	b3Quat ans;\n"
	"	ans = b3Cross3( a, b );\n"
	"	ans += a.w*b+b.w*a;\n"
	"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
	"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
	"	return ans;\n"
	"}\n"
	"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
	"{\n"
	"	b3Quat q;\n"
	"	q=in;\n"
	"	//return b3FastNormalize4(in);\n"
	"	float len = native_sqrt(dot(q, q));\n"
	"	if(len > 0.f)\n"
	"	{\n"
	"		q *= 1.f / len;\n"
	"	}\n"
	"	else\n"
	"	{\n"
	"		q.x = q.y = q.z = 0.f;\n"
	"		q.w = 1.f;\n"
	"	}\n"
	"	return q;\n"
	"}\n"
	"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
	"{\n"
	"	b3Quat qInv = b3QuatInvert( q );\n"
	"	float4 vcpy = vec;\n"
	"	vcpy.w = 0.f;\n"
	"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
	"	return out;\n"
	"}\n"
	"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
	"{\n"
	"	return (b3Quat)(-q.xyz, q.w);\n"
	"}\n"
	"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
	"{\n"
	"	return (b3Quat)(-q.xyz, q.w);\n"
	"}\n"
	"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
	"{\n"
	"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
	"}\n"
	"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
	"{\n"
	"	return b3QuatRotate( orientation, point ) + (translation);\n"
	"}\n"
	"	\n"
	"#endif \n"
	"#endif //B3_QUAT_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"typedef struct\n"
	"{\n"
	"	b3Float4 m_row[3];\n"
	"}b3Mat3x3;\n"
	"#define b3Mat3x3ConstArg const b3Mat3x3\n"
	"#define b3GetRow(m,row) (m.m_row[row])\n"
	"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
	"{\n"
	"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
	"	b3Mat3x3 out;\n"
	"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
	"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
	"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
	"	out.m_row[0].w = 0.f;\n"
	"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
	"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
	"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
	"	out.m_row[1].w = 0.f;\n"
	"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
	"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
	"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
	"	out.m_row[2].w = 0.f;\n"
	"	return out;\n"
	"}\n"
	"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
	"{\n"
	"	b3Mat3x3 out;\n"
	"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
	"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
	"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
	"	return out;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtZero();\n"
	"__inline\n"
	"b3Mat3x3 mtIdentity();\n"
	"__inline\n"
	"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
	"__inline\n"
	"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
	"__inline\n"
	"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
	"__inline\n"
	"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
	"__inline\n"
	"b3Mat3x3 mtZero()\n"
	"{\n"
	"	b3Mat3x3 m;\n"
	"	m.m_row[0] = (b3Float4)(0.f);\n"
	"	m.m_row[1] = (b3Float4)(0.f);\n"
	"	m.m_row[2] = (b3Float4)(0.f);\n"
	"	return m;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtIdentity()\n"
	"{\n"
	"	b3Mat3x3 m;\n"
	"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
	"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
	"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
	"	return m;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
	"{\n"
	"	b3Mat3x3 out;\n"
	"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
	"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
	"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
	"	return out;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
	"{\n"
	"	b3Mat3x3 transB;\n"
	"	transB = mtTranspose( b );\n"
	"	b3Mat3x3 ans;\n"
	"	//	why this doesn't run when 0ing in the for{}\n"
	"	a.m_row[0].w = 0.f;\n"
	"	a.m_row[1].w = 0.f;\n"
	"	a.m_row[2].w = 0.f;\n"
	"	for(int i=0; i<3; i++)\n"
	"	{\n"
	"//	a.m_row[i].w = 0.f;\n"
	"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
	"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
	"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
	"		ans.m_row[i].w = 0.f;\n"
	"	}\n"
	"	return ans;\n"
	"}\n"
	"__inline\n"
	"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
	"{\n"
	"	b3Float4 ans;\n"
	"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
	"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
	"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
	"	ans.w = 0.f;\n"
	"	return ans;\n"
	"}\n"
	"__inline\n"
	"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
	"{\n"
	"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
	"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
	"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
	"	b3Float4 ans;\n"
	"	ans.x = b3Dot3F4( a, colx );\n"
	"	ans.y = b3Dot3F4( a, coly );\n"
	"	ans.z = b3Dot3F4( a, colz );\n"
	"	return ans;\n"
	"}\n"
	"#endif\n"
	"#endif //B3_MAT3x3_H\n"
	"typedef struct b3Aabb b3Aabb_t;\n"
	"struct b3Aabb\n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float m_min[4];\n"
	"		b3Float4 m_minVec;\n"
	"		int m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float m_max[4];\n"
	"		b3Float4 m_maxVec;\n"
	"		int m_signedMaxIndices[4];\n"
	"	};\n"
	"};\n"
	"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
	"						b3Float4ConstArg pos,\n"
	"						b3QuatConstArg orn,\n"
	"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
	"{\n"
	"		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
	"		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
	"		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
	"		b3Mat3x3 m;\n"
	"		m = b3QuatGetRotationMatrix(orn);\n"
	"		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
	"		b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
	"		\n"
	"		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
	"		 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
	"		 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
	"		 0.f);\n"
	"		*aabbMinOut = center-extent;\n"
	"		*aabbMaxOut = center+extent;\n"
	"}\n"
	"/// conservative test for overlap between two aabbs\n"
	"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
	"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
	"	overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
	"	overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"#endif //B3_AABB_H\n"
	"#ifndef B3_COLLIDABLE_H\n"
	"#define B3_COLLIDABLE_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_FLOAT4_H\n"
	"#ifndef B3_QUAT_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_QUAT_H\n"
	"enum b3ShapeTypes\n"
	"{\n"
	"	SHAPE_HEIGHT_FIELD=1,\n"
	"	SHAPE_CONVEX_HULL=3,\n"
	"	SHAPE_PLANE=4,\n"
	"	SHAPE_CONCAVE_TRIMESH=5,\n"
	"	SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n"
	"	SHAPE_SPHERE=7,\n"
	"	MAX_NUM_SHAPE_TYPES,\n"
	"};\n"
	"typedef struct b3Collidable b3Collidable_t;\n"
	"struct b3Collidable\n"
	"{\n"
	"	union {\n"
	"		int m_numChildShapes;\n"
	"		int m_bvhIndex;\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float m_radius;\n"
	"		int	m_compoundBvhIndex;\n"
	"	};\n"
	"	int m_shapeType;\n"
	"	union\n"
	"	{\n"
	"		int m_shapeIndex;\n"
	"		float m_height;\n"
	"	};\n"
	"};\n"
	"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
	"struct b3GpuChildShape\n"
	"{\n"
	"	b3Float4	m_childPosition;\n"
	"	b3Quat	m_childOrientation;\n"
	"	union\n"
	"	{\n"
	"		int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
	"		int m_capsuleAxis;\n"
	"	};\n"
	"	union \n"
	"	{\n"
	"		float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n"
	"		int m_numChildShapes;//used for compound shape\n"
	"	};\n"
	"	union \n"
	"	{\n"
	"		float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n"
	"		int m_collidableShapeIndex;\n"
	"	};\n"
	"	int m_shapeType;\n"
	"};\n"
	"struct b3CompoundOverlappingPair\n"
	"{\n"
	"	int m_bodyIndexA;\n"
	"	int m_bodyIndexB;\n"
	"//	int m_pairType;\n"
	"	int m_childShapeIndexA;\n"
	"	int m_childShapeIndexB;\n"
	"};\n"
	"#endif //B3_COLLIDABLE_H\n"
	"#ifndef B3_RIGIDBODY_DATA_H\n"
	"#define B3_RIGIDBODY_DATA_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_FLOAT4_H\n"
	"#ifndef B3_QUAT_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_QUAT_H\n"
	"#ifndef B3_MAT3x3_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif\n"
	"#endif //B3_MAT3x3_H\n"
	"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
	"struct b3RigidBodyData\n"
	"{\n"
	"	b3Float4				m_pos;\n"
	"	b3Quat					m_quat;\n"
	"	b3Float4				m_linVel;\n"
	"	b3Float4				m_angVel;\n"
	"	int 					m_collidableIdx;\n"
	"	float 				m_invMass;\n"
	"	float 				m_restituitionCoeff;\n"
	"	float 				m_frictionCoeff;\n"
	"};\n"
	"typedef struct b3InertiaData b3InertiaData_t;\n"
	"struct b3InertiaData\n"
	"{\n"
	"	b3Mat3x3 m_invInertiaWorld;\n"
	"	b3Mat3x3 m_initInvInertia;\n"
	"};\n"
	"#endif //B3_RIGIDBODY_DATA_H\n"
	"	\n"
	"void b3ComputeWorldAabb(	int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n"
	"{\n"
	"	__global const b3RigidBodyData_t* body = &bodies[bodyId];\n"
	"	b3Float4 position = body->m_pos;\n"
	"	b3Quat orientation = body->m_quat;\n"
	"	\n"
	"	int collidableIndex = body->m_collidableIdx;\n"
	"	int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n"
	"	\n"
	"	if (shapeIndex>=0)\n"
	"	{\n"
	"		\n"
	"		b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n"
	"		b3Aabb_t worldAabb;\n"
	"		\n"
	"		b3Float4 aabbAMinOut,aabbAMaxOut; \n"
	"		float margin = 0.f;\n"
	"		b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n"
	"		\n"
	"		worldAabb.m_minVec =aabbAMinOut;\n"
	"		worldAabb.m_minIndices[3] = bodyId;\n"
	"		worldAabb.m_maxVec = aabbAMaxOut;\n"
	"		worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n"
	"		worldAabbs[bodyId] = worldAabb;\n"
	"	}\n"
	"}\n"
	"#endif //B3_UPDATE_AABBS_H\n"
	"__kernel void initializeGpuAabbsFull(  const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n"
	"{\n"
	"	int nodeID = get_global_id(0);\n"
	"	if( nodeID < numNodes )\n"
	"	{\n"
	"		b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n"
	"	}\n"
	"}\n"
	"__kernel void clearOverlappingPairsKernel(  __global int4* pairs, int numPairs)\n"
	"{\n"
	"	int pairId = get_global_id(0);\n"
	"	if( pairId< numPairs )\n"
	"	{\n"
	"		pairs[pairId].z = 0xffffffff;\n"
	"	}\n"
	"}\n";