reorder files, in preparation for Bullet 3 -> Bullet 2 merge
This commit is contained in:
565
src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
Normal file
565
src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
Normal file
@@ -0,0 +1,565 @@
|
||||
|
||||
#include "b3GpuSapBroadphase.h"
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Common/b3Quickprof.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
#include "kernels/sapFastKernels.h"
#include "Bullet3Common/b3MinMax.h"
#include <string.h> //memcpy
|
||||
|
||||
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
|
||||
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
|
||||
|
||||
/// Compiles the two SAP OpenCL programs, creates every kernel used by this
/// broadphase and allocates the GPU radix sorter.
///
/// @param ctx     OpenCL context, also handed to all GPU arrays owned by this object.
/// @param device  OpenCL device the programs are compiled for.
/// @param q       command queue, also handed to all GPU arrays owned by this object.
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_currentBuffer(-1),	//listed in declaration order; -1 means init3dSap has not run yet
m_allAabbsGPU(ctx,q),
m_smallAabbsGPU(ctx,q),
m_largeAabbsGPU(ctx,q),
m_overlappingPairs(ctx,q),
m_gpuSmallSortData(ctx,q),
m_gpuSmallSortedAabbs(ctx,q)
{
	const char* sapSrc = sapCL;
	const char* sapFastSrc = sapFastCL;

	cl_int errNum=0;

	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
	b3Assert(errNum==CL_SUCCESS);
	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
	b3Assert(errNum==CL_SUCCESS);

	//large-versus-small pair kernel
	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	//small-versus-small pair kernel. sap.cl also provides "computePairsKernelOriginal",
	//"computePairsKernelBarrier" and "computePairsKernelLocalSharedMemory" as alternatives.
#if 0
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
#else
#ifndef __APPLE__
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
	b3Assert(errNum==CL_SUCCESS);
#else
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
#endif
#endif

	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	//A kernel object keeps its program alive, so the local program handles can be
	//released now instead of being leaked.
	if (sapProg)
		clReleaseProgram(sapProg);
	if (sapFastProg)
		clReleaseProgram(sapFastProg);

	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
}
|
||||
|
||||
/// Frees the radix sorter and releases the five kernels created by the constructor.
b3GpuSapBroadphase::~b3GpuSapBroadphase()
{
	delete m_sorter;

	//the kernels are independent of each other, any release order works
	clReleaseKernel(m_sap2Kernel);
	clReleaseKernel(m_sapKernel);
	clReleaseKernel(m_copyAabbsKernel);
	clReleaseKernel(m_flipFloatKernel);
	clReleaseKernel(m_scatterKernel);
}
|
||||
|
||||
/// conservative test for overlap between two aabbs
|
||||
static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
|
||||
const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
|
||||
{
|
||||
bool overlap = true;
|
||||
overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
|
||||
overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
|
||||
overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
|
||||
|
||||
|
||||
//http://stereopsis.com/radix.html
/// Converts a float into an unsigned sort key: all bits are flipped when the sign
/// bit is set, otherwise only the sign bit is flipped. Comparing the keys as
/// unsigned integers then orders them like the original floats (see link above).
/// Assumes float and unsigned int have the same size.
static unsigned int FloatFlip(float fl)
{
	//memcpy instead of a pointer cast: reading a float through an unsigned int
	//pointer violates the strict aliasing rule
	unsigned int f;
	memcpy(&f, &fl, sizeof(f));
	const unsigned int mask = (f & 0x80000000u) ? 0xffffffffu : 0x80000000u;
	return f ^ mask;
}
|
||||
|
||||
void b3GpuSapBroadphase::init3dSap()
|
||||
{
|
||||
if (m_currentBuffer<0)
|
||||
{
|
||||
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
|
||||
|
||||
m_currentBuffer = 0;
|
||||
for (int axis=0;axis<3;axis++)
|
||||
{
|
||||
for (int buf=0;buf<2;buf++)
|
||||
{
|
||||
int totalNumAabbs = m_allAabbsCPU.size();
|
||||
m_sortedAxisCPU[axis][buf].resize(totalNumAabbs);
|
||||
|
||||
if (buf==m_currentBuffer)
|
||||
{
|
||||
for (int i=0;i<totalNumAabbs;i++)
|
||||
{
|
||||
m_sortedAxisCPU[axis][buf][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
|
||||
m_sortedAxisCPU[axis][buf][i].m_value = i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
|
||||
{
|
||||
b3Assert(m_currentBuffer>=0);
|
||||
if (m_currentBuffer<0)
|
||||
return;
|
||||
|
||||
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
|
||||
|
||||
for (int axis=0;axis<3;axis++)
|
||||
{
|
||||
for (int buf=0;buf<2;buf++)
|
||||
{
|
||||
b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
m_currentBuffer = 1-m_currentBuffer;
|
||||
|
||||
for (int axis=0;axis<3;axis++)
|
||||
{
|
||||
int totalNumAabbs = m_allAabbsCPU.size();
|
||||
for (int i=0;i<totalNumAabbs;i++)
|
||||
{
|
||||
m_sortedAxisCPU[axis][m_currentBuffer][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
|
||||
m_sortedAxisCPU[axis][m_currentBuffer][i].m_value = i;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
void b3GpuSapBroadphase::calculateOverlappingPairsHost()
|
||||
{
|
||||
//test
|
||||
//if (m_currentBuffer>=0)
|
||||
// calculateOverlappingPairsHostIncremental3Sap();
|
||||
|
||||
int axis=0;
|
||||
|
||||
b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
|
||||
|
||||
|
||||
|
||||
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
|
||||
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int j=0;j<numSmallAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
|
||||
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsCPU.size();
|
||||
for (int j=0;j<numLargeAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
|
||||
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
b3AlignedObjectArray<b3Int2> hostPairs;
|
||||
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int i=0;i<numSmallAabbs;i++)
|
||||
{
|
||||
float reference = m_smallAabbsCPU[i].m_max[axis];
|
||||
|
||||
for (int j=i+1;j<numSmallAabbs;j++)
|
||||
{
|
||||
if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
|
||||
(b3Vector3&)m_smallAabbsCPU[j].m_min,(b3Vector3&)m_smallAabbsCPU[j].m_max))
|
||||
{
|
||||
b3Int2 pair;
|
||||
pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
|
||||
pair.y = m_smallAabbsCPU[j].m_minIndices[3];
|
||||
hostPairs.push_back(pair);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int i=0;i<numSmallAabbs;i++)
|
||||
{
|
||||
float reference = m_smallAabbsCPU[i].m_max[axis];
|
||||
int numLargeAabbs = m_largeAabbsCPU.size();
|
||||
|
||||
for (int j=0;j<numLargeAabbs;j++)
|
||||
{
|
||||
if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
|
||||
(b3Vector3&)m_largeAabbsCPU[j].m_min,(b3Vector3&)m_largeAabbsCPU[j].m_max))
|
||||
{
|
||||
b3Int2 pair;
|
||||
pair.x = m_largeAabbsCPU[j].m_minIndices[3];
|
||||
pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
|
||||
hostPairs.push_back(pair);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (hostPairs.size())
|
||||
{
|
||||
m_overlappingPairs.copyFromHost(hostPairs);
|
||||
} else
|
||||
{
|
||||
m_overlappingPairs.resize(0);
|
||||
}
|
||||
|
||||
//init3dSap();
|
||||
|
||||
}
|
||||
|
||||
void b3GpuSapBroadphase::calculateOverlappingPairs()
|
||||
{
|
||||
int axis = 0;//todo on GPU for now hardcode
|
||||
|
||||
|
||||
|
||||
{
|
||||
|
||||
bool syncOnHost = false;
|
||||
|
||||
if (syncOnHost)
|
||||
{
|
||||
B3_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
|
||||
|
||||
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
|
||||
|
||||
m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int j=0;j<numSmallAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
|
||||
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
}
|
||||
}
|
||||
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
|
||||
|
||||
} else
|
||||
{
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsGPU.size();
|
||||
if (numSmallAabbs)
|
||||
{
|
||||
B3_PROFILE("copyAabbsKernelSmall");
|
||||
b3BufferInfoCL bInfo[] = {
|
||||
b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
|
||||
b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()),
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs );
|
||||
int num = numSmallAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (syncOnHost)
|
||||
{
|
||||
B3_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
|
||||
|
||||
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
|
||||
|
||||
m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsCPU.size();
|
||||
for (int j=0;j<numLargeAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
|
||||
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
}
|
||||
}
|
||||
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
|
||||
|
||||
} else
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsGPU.size();
|
||||
|
||||
if (numLargeAabbs)
|
||||
{
|
||||
B3_PROFILE("copyAabbsKernelLarge");
|
||||
b3BufferInfoCL bInfo[] = {
|
||||
b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
|
||||
b3BufferInfoCL( m_largeAabbsGPU.getBufferCL()),
|
||||
};
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numLargeAabbs );
|
||||
int num = numLargeAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
B3_PROFILE("GPU SAP");
|
||||
|
||||
int numSmallAabbs = m_smallAabbsGPU.size();
|
||||
m_gpuSmallSortData.resize(numSmallAabbs);
|
||||
int numLargeAabbs = m_smallAabbsGPU.size();
|
||||
|
||||
#if 1
|
||||
if (m_smallAabbsGPU.size())
|
||||
{
|
||||
B3_PROFILE("flipFloatKernel");
|
||||
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())};
|
||||
b3LauncherCL launcher(m_queue, m_flipFloatKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs );
|
||||
launcher.setConst( axis );
|
||||
|
||||
int num = numSmallAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
{
|
||||
B3_PROFILE("gpu radix sort\n");
|
||||
m_sorter->execute(m_gpuSmallSortData);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
m_gpuSmallSortedAabbs.resize(numSmallAabbs);
|
||||
if (numSmallAabbs)
|
||||
{
|
||||
B3_PROFILE("scatterKernel");
|
||||
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
|
||||
b3LauncherCL launcher(m_queue, m_scatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs);
|
||||
int num = numSmallAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
|
||||
}
|
||||
|
||||
|
||||
int maxPairsPerBody = 64;
|
||||
int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
|
||||
m_overlappingPairs.resize(maxPairs);
|
||||
|
||||
b3OpenCLArray<int> pairCount(m_context, m_queue);
|
||||
pairCount.push_back(0);
|
||||
int numPairs=0;
|
||||
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsGPU.size();
|
||||
if (numLargeAabbs && numSmallAabbs)
|
||||
{
|
||||
B3_PROFILE("sap2Kernel");
|
||||
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_largeAabbsGPU.getBufferCL() ),b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
|
||||
b3LauncherCL launcher(m_queue, m_sap2Kernel);
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numLargeAabbs );
|
||||
launcher.setConst( numSmallAabbs);
|
||||
launcher.setConst( axis );
|
||||
launcher.setConst( maxPairs );
|
||||
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
|
||||
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
|
||||
|
||||
numPairs = pairCount.at(0);
|
||||
if (numPairs >maxPairs)
|
||||
numPairs =maxPairs;
|
||||
|
||||
}
|
||||
}
|
||||
if (m_gpuSmallSortedAabbs.size())
|
||||
{
|
||||
B3_PROFILE("sapKernel");
|
||||
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
|
||||
b3LauncherCL launcher(m_queue, m_sapKernel);
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs );
|
||||
launcher.setConst( axis );
|
||||
launcher.setConst( maxPairs );
|
||||
|
||||
|
||||
int num = numSmallAabbs;
|
||||
#if 0
|
||||
int buffSize = launcher.getSerializationBufferSize();
|
||||
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
|
||||
for (int i=0;i<buffSize+1;i++)
|
||||
{
|
||||
unsigned char* ptr = (unsigned char*)&buf[i];
|
||||
*ptr = 0xff;
|
||||
}
|
||||
int actualWrite = launcher.serializeArguments(buf,buffSize);
|
||||
|
||||
unsigned char* cptr = (unsigned char*)&buf[buffSize];
|
||||
// printf("buf[buffSize] = %d\n",*cptr);
|
||||
|
||||
assert(buf[buffSize]==0xff);//check for buffer overrun
|
||||
int* ptr = (int*)&buf[buffSize];
|
||||
|
||||
*ptr = num;
|
||||
|
||||
FILE* f = fopen("m_sapKernelArgs.bin","wb");
|
||||
fwrite(buf,buffSize+sizeof(int),1,f);
|
||||
fclose(f);
|
||||
#endif//
|
||||
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
|
||||
numPairs = pairCount.at(0);
|
||||
if (numPairs>maxPairs)
|
||||
numPairs = maxPairs;
|
||||
}
|
||||
|
||||
#else
|
||||
int numPairs = 0;
|
||||
|
||||
|
||||
b3LauncherCL launcher(m_queue, m_sapKernel);
|
||||
|
||||
const char* fileName = "m_sapKernelArgs.bin";
|
||||
FILE* f = fopen(fileName,"rb");
|
||||
if (f)
|
||||
{
|
||||
int sizeInBytes=0;
|
||||
if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
|
||||
{
|
||||
printf("error, cannot get file size\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
|
||||
fread(buf,sizeInBytes,1,f);
|
||||
int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
|
||||
int num = *(int*)&buf[serializedBytes];
|
||||
launcher.launch1D( num);
|
||||
|
||||
b3OpenCLArray<int> pairCount(m_context, m_queue);
|
||||
int numElements = launcher.m_arrays[2]->size()/sizeof(int);
|
||||
pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
|
||||
numPairs = pairCount.at(0);
|
||||
//printf("overlapping pairs = %d\n",numPairs);
|
||||
b3AlignedObjectArray<b3Int2> hostOoverlappingPairs;
|
||||
b3OpenCLArray<b3Int2> tmpGpuPairs(m_context,m_queue);
|
||||
tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
|
||||
|
||||
tmpGpuPairs.copyToHost(hostOoverlappingPairs);
|
||||
m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
|
||||
//printf("hello %d\n", m_overlappingPairs.size());
|
||||
free(buf);
|
||||
fclose(f);
|
||||
|
||||
} else {
|
||||
printf("error: cannot find file %s\n",fileName);
|
||||
}
|
||||
|
||||
clFinish(m_queue);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
m_overlappingPairs.resize(numPairs);
|
||||
|
||||
}//B3_PROFILE("GPU_RADIX SORT");
|
||||
|
||||
|
||||
}
|
||||
|
||||
void b3GpuSapBroadphase::writeAabbsToGpu()
|
||||
{
|
||||
m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
|
||||
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
|
||||
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
|
||||
|
||||
}
|
||||
|
||||
void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
|
||||
{
|
||||
int index = userPtr;
|
||||
b3SapAabb aabb;
|
||||
for (int i=0;i<4;i++)
|
||||
{
|
||||
aabb.m_min[i] = aabbMin[i];
|
||||
aabb.m_max[i] = aabbMax[i];
|
||||
}
|
||||
aabb.m_minIndices[3] = index;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
|
||||
m_largeAabbsCPU.push_back(aabb);
|
||||
m_allAabbsCPU.push_back(aabb);
|
||||
}
|
||||
|
||||
void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
|
||||
{
|
||||
int index = userPtr;
|
||||
b3SapAabb aabb;
|
||||
for (int i=0;i<4;i++)
|
||||
{
|
||||
aabb.m_min[i] = aabbMin[i];
|
||||
aabb.m_max[i] = aabbMax[i];
|
||||
}
|
||||
aabb.m_minIndices[3] = index;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
|
||||
m_smallAabbsCPU.push_back(aabb);
|
||||
m_allAabbsCPU.push_back(aabb);
|
||||
}
|
||||
|
||||
/// Returns the OpenCL buffer behind m_allAabbsGPU.
cl_mem	b3GpuSapBroadphase::getAabbBufferWS()
{
	cl_mem allAabbsBuffer = m_allAabbsGPU.getBufferCL();
	return allAabbsBuffer;
}
|
||||
|
||||
int b3GpuSapBroadphase::getNumOverlap()
|
||||
{
|
||||
return m_overlappingPairs.size();
|
||||
}
|
||||
/// Returns the OpenCL buffer behind m_overlappingPairs.
cl_mem	b3GpuSapBroadphase::getOverlappingPairBuffer()
{
	cl_mem pairBuffer = m_overlappingPairs.getBufferCL();
	return pairBuffer;
}
|
||||
69
src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
Normal file
69
src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
Normal file
@@ -0,0 +1,69 @@
|
||||
#ifndef B3_GPU_SAP_BROADPHASE_H
|
||||
#define B3_GPU_SAP_BROADPHASE_H
|
||||
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
|
||||
class b3Vector3;
|
||||
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
|
||||
|
||||
#include "b3SapAabb.h"
|
||||
|
||||
|
||||
|
||||
class b3GpuSapBroadphase
|
||||
{
|
||||
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
cl_kernel m_flipFloatKernel;
|
||||
cl_kernel m_scatterKernel ;
|
||||
cl_kernel m_copyAabbsKernel;
|
||||
cl_kernel m_sapKernel;
|
||||
cl_kernel m_sap2Kernel;
|
||||
|
||||
class b3RadixSort32CL* m_sorter;
|
||||
|
||||
///test for 3d SAP
|
||||
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
|
||||
int m_currentBuffer;
|
||||
|
||||
public:
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
|
||||
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_smallAabbsGPU;
|
||||
b3AlignedObjectArray<b3SapAabb> m_smallAabbsCPU;
|
||||
|
||||
b3OpenCLArray<b3SapAabb> m_largeAabbsGPU;
|
||||
b3AlignedObjectArray<b3SapAabb> m_largeAabbsCPU;
|
||||
|
||||
b3OpenCLArray<b3Int2> m_overlappingPairs;
|
||||
|
||||
//temporary gpu work memory
|
||||
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
|
||||
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
|
||||
|
||||
|
||||
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
|
||||
virtual ~b3GpuSapBroadphase();
|
||||
|
||||
void calculateOverlappingPairs();
|
||||
void calculateOverlappingPairsHost();
|
||||
|
||||
void init3dSap();
|
||||
void calculateOverlappingPairsHostIncremental3Sap();
|
||||
|
||||
void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
|
||||
void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
void writeAabbsToGpu();
|
||||
|
||||
cl_mem getAabbBufferWS();
|
||||
int getNumOverlap();
|
||||
cl_mem getOverlappingPairBuffer();
|
||||
};
|
||||
|
||||
#endif //B3_GPU_SAP_BROADPHASE_H
|
||||
18
src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
Normal file
18
src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
Normal file
@@ -0,0 +1,18 @@
|
||||
#ifndef B3_SAP_AABB_H
|
||||
#define B3_SAP_AABB_H
|
||||
|
||||
/// Axis aligned bounding box as stored by the SAP broadphase.
/// Each corner is four floats that can also be viewed as four ints; the
/// broadphase stores integer bookkeeping values in component 3 of both corners.
struct b3SapAabb
{
	//minimum corner
	union
	{
		float	m_min[4];
		int		m_minIndices[4];
	};

	//maximum corner
	union
	{
		float	m_max[4];
		int		m_signedMaxIndices[4];
	};
};
|
||||
|
||||
#endif //B3_SAP_AABB_H
|
||||
320
src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
Normal file
320
src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
Normal file
@@ -0,0 +1,320 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Erwin Coumans
|
||||
|
||||
|
||||
// Axis aligned bounding box. Each corner can be accessed as a float4, as four
// floats or as four ints.
typedef struct
{
	// minimum corner
	union
	{
		float4	m_min;
		float   m_minElems[4];
		int			m_minIndices[4];
	};
	// maximum corner
	union
	{
		float4	m_max;
		float   m_maxElems[4];
		int			m_maxIndices[4];
	};
} btAabbCL;
|
||||
|
||||
|
||||
/// Conservative overlap test between a private-memory aabb and a __local aabb.
/// Returns false when the boxes are separated along x, z or y, true otherwise.
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	// negated separation tests, so NaN input behaves as in the "? false : overlap" form
	bool overlapX = !(aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x);
	bool overlapZ = !(aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z);
	bool overlapY = !(aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y);
	return overlapX && overlapZ && overlapY;
}
|
||||
// Conservative overlap test between two __global aabbs.
// Returns false when the boxes are separated along x, z or y, true otherwise.
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
	// negated separation tests, so NaN input behaves as in the "? false : overlap" form
	bool overlapX = !(aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x);
	bool overlapZ = !(aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z);
	bool overlapY = !(aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y);
	return overlapX && overlapZ && overlapY;
}
|
||||
|
||||
// Conservative overlap test between a private-memory aabb and a __global aabb.
// Returns false when the boxes are separated along x, z or y, true otherwise.
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
	// negated separation tests, so NaN input behaves as in the "? false : overlap" form
	bool overlapX = !(aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x);
	bool overlapZ = !(aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z);
	bool overlapY = !(aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y);
	return overlapX && overlapZ && overlapY;
}
|
||||
|
||||
|
||||
// 2D kernel: work-item (i,j) tests unsortedAabbs[i] against sortedAabbs[j] and, on
// overlap, appends the pair of original indices (m_minIndices[3]) to pairsOut.
// pairCount is incremented for every overlap; pairs beyond maxPairs are counted
// but not stored. 'axis' is not used by this kernel.
__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
{
	const int i = get_global_id(0);
	const int j = get_global_id(1);

	// padding work-items outside either array have nothing to do
	if (i>=numUnsortedAabbs || j>=numSortedAabbs)
		return;

	if (!TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))
		return;

	int2 myPair;
	myPair.x = unsortedAabbs[i].m_minIndices[3];
	myPair.y = sortedAabbs[j].m_minIndices[3];

	// reserve an output slot
	int curPair = atomic_inc (pairCount);
	if (curPair<maxPairs)
	{
		pairsOut[curPair] = myPair; //flush to main memory
	}
}
|
||||
|
||||
// 1D kernel: work-item i walks the aabbs that follow it in the array and stops at the
// first one whose minimum along 'axis' lies beyond aabb i's maximum. Every
// overlapping pair found on the way is appended to pairsOut; pairCount counts all
// overlaps, pairs beyond maxPairs are not stored.
__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	const int i = get_global_id(0);
	if (i>=numObjects)
		return;

	// the loop condition carries the early-out that was a 'break' in the loop body
	for (int j=i+1; j<numObjects && !(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])); j++)
	{
		if (!TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
			continue;

		int2 myPair;
		myPair.x = aabbs[i].m_minIndices[3];
		myPair.y = aabbs[j].m_minIndices[3];

		// reserve an output slot
		int curPair = atomic_inc (pairCount);
		if (curPair<maxPairs)
		{
			pairsOut[curPair] = myPair; //flush to main memory
		}
	}
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Variant of computePairsKernelOriginal in which all work-items of a work-group step
// through their candidates in lock-step. A work-item that is finished (ran past the
// array or hit the early-out along 'axis') bumps breakRequest once; the group leaves
// the loop when breakRequest has reached the number of work-items in the group.
__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	int i = get_global_id(0);
	int lid = get_local_id(0);

	__local int numActiveWgItems[1];
	__local int breakRequest[1];

	// the first work-item resets the shared counters
	if (lid==0)
	{
		numActiveWgItems[0] = 0;
		breakRequest[0] = 0;
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	// every work-item of the group registers itself
	atomic_inc(numActiveWgItems);
	barrier(CLK_LOCAL_MEM_FENCE);

	int finished = 0;
	int j = i+1;

	do
	{
		barrier(CLK_LOCAL_MEM_FENCE);

		// early-out: candidate j starts beyond the end of aabb i along 'axis'
		if (j<numObjects && !finished && aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
		{
			atomic_inc(breakRequest);
			finished = 1;
		}

		barrier(CLK_LOCAL_MEM_FENCE);

		// ran past the end of the array
		if (j>=numObjects && !finished)
		{
			atomic_inc(breakRequest);
			finished = 1;
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		if (!finished && TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
		{
			int2 myPair;
			myPair.x = aabbs[i].m_minIndices[3];
			myPair.y = aabbs[j].m_minIndices[3];
			int curPair = atomic_inc (pairCount);
			if (curPair<maxPairs)
			{
				pairsOut[curPair] = myPair; //flush to main memory
			}
		}
		j++;

	} while (breakRequest[0]<numActiveWgItems[0]);
}
|
||||
|
||||
|
||||
// Variant of computePairsKernelBarrier that stages candidate aabbs in __local memory.
// Each work-item loads two entries (at lid and lid+64) of a 128-entry tile and
// compares its own aabb against tile entry tileStep+lid+1; after 64 steps the tile is
// reloaded 64 elements further on. Out-of-range loads fall back to aabbs[0].
// NOTE(review): the +64 offsets and the 128-entry tile suggest a work-group size of
// 64 is assumed - confirm against the host-side launch.
__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	int i = get_global_id(0);
	int lid = get_local_id(0);

	__local int numActiveWgItems[1];
	__local int breakRequest[1];
	__local btAabbCL localAabbs[128];// = aabbs[i];

	// private copy of this work-item's own aabb
	btAabbCL myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
	float testValue = 	myAabb.m_maxElems[axis];

	// the first work-item resets the shared counters
	if (lid==0)
	{
		numActiveWgItems[0] = 0;
		breakRequest[0] = 0;
	}

	int tileStep=0;	// steps taken inside the current tile
	int tileBase=0;	// offset of the current tile relative to i

	// initial tile
	localAabbs[lid] = (i+tileBase)<numObjects? aabbs[i+tileBase] : aabbs[0];
	localAabbs[lid+64] = (i+tileBase+64)<numObjects? aabbs[i+tileBase+64]: aabbs[0];

	barrier(CLK_LOCAL_MEM_FENCE);
	// every work-item of the group registers itself
	atomic_inc(numActiveWgItems);
	barrier(CLK_LOCAL_MEM_FENCE);

	int finished = 0;
	int j = i+1;

	do
	{
		barrier(CLK_LOCAL_MEM_FENCE);

		// tile slot of candidate j
		int slot = tileStep+lid+1;

		// early-out: the candidate starts beyond the end of this aabb along 'axis'
		if (j<numObjects && !finished && testValue < (localAabbs[slot].m_minElems[axis]))
		{
			atomic_inc(breakRequest);
			finished = 1;
		}

		barrier(CLK_LOCAL_MEM_FENCE);

		// ran past the end of the array
		if (j>=numObjects && !finished)
		{
			atomic_inc(breakRequest);
			finished = 1;
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		if (!finished && TestAabbAgainstAabb2(&myAabb,&localAabbs[slot]))
		{
			int2 myPair;
			myPair.x = myAabb.m_minIndices[3];
			myPair.y = localAabbs[slot].m_minIndices[3];
			int curPair = atomic_inc (pairCount);
			if (curPair<maxPairs)
			{
				pairsOut[curPair] = myPair; //flush to main memory
			}
		}

		barrier(CLK_LOCAL_MEM_FENCE);

		tileStep++;
		if (tileStep==64)
		{
			// tile exhausted: advance by 64 elements and reload both halves
			tileStep = 0;
			tileBase+=64;
			localAabbs[lid] = ((i+tileBase)<numObjects) ? aabbs[i+tileBase] : aabbs[0];
			localAabbs[lid+64] = ((i+64+tileBase)<numObjects) ? aabbs[i+tileBase+64] : aabbs[0];
		}
		j++;

	} while (breakRequest[0]<numActiveWgItems[0]);

}
|
||||
|
||||
|
||||
|
||||
|
||||
//http://stereopsis.com/radix.html
// Converts a float into an unsigned sort key: all bits are flipped when the sign bit
// is set, otherwise only the sign bit is flipped (see link above).
unsigned int FloatFlip(float fl);
unsigned int FloatFlip(float fl)
{
	// as_uint reinterprets the bits without a pointer cast
	unsigned int f = as_uint(fl);
	unsigned int mask = -(int)(f >> 31) | 0x80000000;
	return f ^ mask;
}
|
||||
// Turns a key produced by FloatFlip back into the float: only the top bit is flipped
// when it is set, otherwise all bits are flipped.
float IFloatFlip(unsigned int f);
float IFloatFlip(unsigned int f)
{
	unsigned int mask = ((f >> 31) - 1) | 0x80000000;
	unsigned int fl = f ^ mask;
	// as_float reinterprets the bits without a pointer cast
	return as_float(fl);
}
|
||||
|
||||
|
||||
|
||||
|
||||
// Refreshes destAabbs[i] from allAabbs: m_maxIndices[3] of each destination entry
// holds the source index, which is written back after the copy overwrote it.
__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
	const int i = get_global_id(0);
	if (i<numObjects)
	{
		const int srcIndex = destAabbs[i].m_maxIndices[3];
		destAabbs[i] = allAabbs[srcIndex];
		destAabbs[i].m_maxIndices[3] = srcIndex;
	}
}
|
||||
|
||||
|
||||
// Fills sortData[i] with (x = FloatFlip of the aabb's minimum along 'axis', y = i).
__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
{
	const int i = get_global_id(0);
	if (i<numObjects)
	{
		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);
		sortData[i].y = i;
	}
}
|
||||
|
||||
|
||||
/// Gathers AABBs into the order given by sortData: output slot i receives the
/// AABB whose index is stored in sortData[i].y. Work-items beyond numObjects
/// write nothing.
__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
{
	const int gid = get_global_id(0);
	if (gid < numObjects)
	{
		const int srcIndex = sortData[gid].y;
		sortedAabbs[gid] = aabbs[srcIndex];
	}
}
|
||||
161
src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
Normal file
161
src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
Normal file
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Erwin Coumans
|
||||
|
||||
|
||||
/// Axis-aligned bounding box as seen by the kernels in this file.
/// Each corner is a 16-byte union so the same storage can be read as a
/// float4, as a float array indexed by axis, or as raw ints.
typedef struct
{
	// minimum corner
	union
	{
		float4 m_min;
		float  m_minElems[4];    // per-axis access: m_minElems[axis]
		int    m_minIndices[4];  // element [3] is written into output pairs by computePairsKernel
	};
	// maximum corner
	union
	{
		float4 m_max;
		float  m_maxElems[4];    // per-axis access: m_maxElems[axis]
		int    m_maxIndices[4];  // element [3] == 0 is treated as "static" by TestAabbAgainstAabb2
	};
} btAabbCL;
|
||||
|
||||
|
||||
/// conservative test for overlap between two aabbs
|
||||
/// Conservative overlap test between a private AABB and one held in local memory.
/// Returns false when both boxes are flagged static (m_maxIndices[3] == 0) or
/// when they are separated along any of the three axes; true otherwise.
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	// pairs between two static (mass=0) objects are never reported
	const bool bothStatic = (aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0);
	if (bothStatic)
		return false;

	// a gap along any single axis rules out an overlap
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;

	return true;
}
|
||||
|
||||
|
||||
//computePairsKernelBatchWrite
|
||||
/// Appends a work-item's privately buffered pairs to the global pair list.
/// pairCount is always advanced by the full batchSize, so it may end up larger
/// than maxPairs. Only the pairs that fit below maxPairs are written, so a
/// batch that exactly fills the buffer is kept and a batch that straddles the
/// end is written partially instead of being dropped and leaving a hole.
void flushPairBatch(const int2* batch, int batchSize, volatile __global int2* pairsOut, volatile __global int* pairCount, int maxPairs);
void flushPairBatch(const int2* batch, int batchSize, volatile __global int2* pairsOut, volatile __global int* pairCount, int maxPairs)
{
	// reserve batchSize consecutive slots
	int firstSlot = atomic_add(pairCount, batchSize);
	// slots still available from firstSlot on (<= 0 when the buffer is already full)
	int room = maxPairs - firstSlot;
	int numToWrite = (batchSize < room) ? batchSize : room;
	for (int p=0;p<numToWrite;p++)
	{
		pairsOut[firstSlot+p] = batch[p]; //flush to main memory
	}
}

/// Finds overlapping AABB pairs, batching the writes to global memory.
///
/// Work-item i tests aabbs[i] against aabbs[i+1], aabbs[i+2], ... and stops
/// once its maximum along `axis` is smaller than the candidate's minimum along
/// `axis` (presumably aabbs is sorted by that minimum -- confirm with the host code).
/// Candidates are staged in local memory, 128 at a time. Found pairs are
/// collected in a private 64-entry buffer and flushed with one atomic_add.
///
/// aabbs      : input boxes; m_minIndices[3] of each box is what gets written to pairsOut
/// pairsOut   : output pair list, capacity maxPairs
/// pairCount  : running number of pairs found; may exceed maxPairs on overflow
/// numObjects : number of entries in aabbs
/// axis       : index (into m_minElems / m_maxElems) of the sweep axis
/// maxPairs   : capacity of pairsOut
///
/// NOTE: the local-memory indexing (localId+64, localCount+localId+1 into a
/// 128-entry array) requires a work-group size of at most 64.
/// NOTE: sapFastKernels.h embeds a stringified copy of this file and has to be
/// regenerated after editing this kernel.
__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	int i = get_global_id(0);
	int localId = get_local_id(0);

	__local int numActiveWgItems[1];
	__local int breakRequest[1];
	__local btAabbCL localAabbs[128];// = aabbs[i];

	// private staging buffer for found pairs
	int2 myPairs[64];

	btAabbCL myAabb;

	// out-of-range work-items still take part in the barriers, using aabbs[0] as a stand-in
	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
	float testValue = myAabb.m_maxElems[axis];

	if (localId==0)
	{
		numActiveWgItems[0] = 0;
		breakRequest[0] = 0;
	}
	int localCount=0;
	int block=0;
	// stage the first 128 candidates of this work-group
	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];

	barrier(CLK_LOCAL_MEM_FENCE);
	// count the work-items of this group; the loop ends when all of them asked to break
	atomic_inc(numActiveWgItems);
	barrier(CLK_LOCAL_MEM_FENCE);
	int localBreak = 0;
	int curNumPairs = 0;

	int j=i+1;
	do
	{
		barrier(CLK_LOCAL_MEM_FENCE);

		if (j<numObjects)
		{
			// candidate starts beyond our extent on the sweep axis: this work-item is done
			if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
			{
				if (!localBreak)
				{
					atomic_inc(breakRequest);
					localBreak = 1;
				}
			}
		}

		barrier(CLK_LOCAL_MEM_FENCE);

		// ran off the end of the array: this work-item is done as well
		if (j>=numObjects && !localBreak)
		{
			atomic_inc(breakRequest);
			localBreak = 1;
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		if (!localBreak)
		{
			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
			{
				int2 myPair;
				myPair.x = myAabb.m_minIndices[3];
				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
				myPairs[curNumPairs] = myPair;
				curNumPairs++;
				// private buffer full: flush it and start over
				if (curNumPairs==64)
				{
					flushPairBatch(myPairs, curNumPairs, pairsOut, pairCount, maxPairs);
					curNumPairs = 0;
				}
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		localCount++;
		if (localCount==64)
		{
			// the staged window is used up: slide it forward by 64 candidates
			localCount = 0;
			block+=64;
			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
		}
		j++;

	} while (breakRequest[0]<numActiveWgItems[0]);

	// flush whatever is left in the private buffer
	if (curNumPairs>0)
	{
		flushPairBatch(myPairs, curNumPairs, pairsOut, pairCount, maxPairs);
	}
}
|
||||
164
src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
Normal file
164
src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
Normal file
@@ -0,0 +1,164 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* sapFastCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
"//skip pairs between static (mass=0) objects\n"
|
||||
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
|
||||
" return false;\n"
|
||||
" \n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"//computePairsKernelBatchWrite\n"
|
||||
"__kernel void computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" int2 myPairs[64];\n"
|
||||
" \n"
|
||||
" b3AabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" int curNumPairs = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" myPairs[curNumPairs] = myPair;\n"
|
||||
" curNumPairs++;\n"
|
||||
" if (curNumPairs==64)\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_add(pairCount,curNumPairs);\n"
|
||||
" //avoid a buffer overrun\n"
|
||||
" if ((curPair+curNumPairs)<maxPairs)\n"
|
||||
" {\n"
|
||||
" for (int p=0;p<curNumPairs;p++)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" curNumPairs = 0;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" if (curNumPairs>0)\n"
|
||||
" {\n"
|
||||
" //avoid a buffer overrun\n"
|
||||
" int curPair = atomic_add(pairCount,curNumPairs);\n"
|
||||
" if ((curPair+curNumPairs)<maxPairs)\n"
|
||||
" {\n"
|
||||
" for (int p=0;p<curNumPairs;p++)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" curNumPairs = 0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
324
src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
Normal file
324
src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
Normal file
@@ -0,0 +1,324 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* sapCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} b3AabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numUnsortedAabbs)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" int j = get_global_id(1);\n"
|
||||
" if (j>=numSortedAabbs)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" \n"
|
||||
" myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = sortedAabbs[j].m_minIndices[3];\n"
|
||||
"\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" for (int j=i+1;j<numObjects;j++)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
"\n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
"\n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
"\n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" b3AabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"//http://stereopsis.com/radix.html\n"
|
||||
"unsigned int FloatFlip(float fl);\n"
|
||||
"unsigned int FloatFlip(float fl)\n"
|
||||
"{\n"
|
||||
" unsigned int f = *(unsigned int*)&fl;\n"
|
||||
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
|
||||
" return f ^ mask;\n"
|
||||
"}\n"
|
||||
"float IFloatFlip(unsigned int f);\n"
|
||||
"float IFloatFlip(unsigned int f)\n"
|
||||
"{\n"
|
||||
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
|
||||
" unsigned int fl = f ^ mask;\n"
|
||||
" return *(float*)&fl;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" int src = destAabbs[i].m_maxIndices[3];\n"
|
||||
" destAabbs[i] = allAabbs[src];\n"
|
||||
" destAabbs[i].m_maxIndices[3] = src;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
|
||||
" sortData[i].y = i;\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" sortedAabbs[i] = aabbs[sortData[i].y];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
||||
Reference in New Issue
Block a user