reorder files, in preparation for Bullet 3 -> Bullet 2 merge

This commit is contained in:
erwincoumans
2013-04-29 19:04:08 -07:00
parent 55b69201a9
commit 3ac332f3a7
162 changed files with 215 additions and 3070 deletions

View File

@@ -0,0 +1,565 @@
#include "b3GpuSapBroadphase.h"

#include <string.h>

#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3Common/b3Quickprof.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
#include "kernels/sapFastKernels.h"
#include "Bullet3Common/b3MinMax.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
/// Compiles the SAP OpenCL programs, creates every kernel used by the broadphase
/// and allocates the radix sorter.
/// @param ctx    OpenCL context passed to the GPU arrays and the kernel compiler
/// @param device OpenCL device the programs are built for
/// @param q      command queue used by the GPU arrays, the kernels and the sorter
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_currentBuffer(-1),	// listed in declaration order; -1 means init3dSap has not run yet
m_allAabbsGPU(ctx,q),
m_smallAabbsGPU(ctx,q),
m_largeAabbsGPU(ctx,q),
m_overlappingPairs(ctx,q),
m_gpuSmallSortData(ctx,q),
m_gpuSmallSortedAabbs(ctx,q)
{
	const char* sapSrc = sapCL;
	const char* sapFastSrc = sapFastCL;

	cl_int errNum=0;

	// NOTE(review): sapProg/sapFastProg are never released with clReleaseProgram;
	// confirm whether b3OpenCLUtils keeps ownership before adding a release here.
	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
	b3Assert(errNum==CL_SUCCESS);

	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
	b3Assert(errNum==CL_SUCCESS);

	// Alternative single-array kernels available in sap.cl:
	// "computePairsKernelOriginal", "computePairsKernelBarrier", "computePairsKernelLocalSharedMemory"
	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

#if 0
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
#else
#ifndef __APPLE__
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
	b3Assert(errNum==CL_SUCCESS);
#else
	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);
#endif
#endif

	// every kernel creation is checked, not only the pair kernels
	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
	b3Assert(errNum==CL_SUCCESS);

	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
}
/// Destroys the radix sorter and releases all five kernels created by the constructor.
b3GpuSapBroadphase::~b3GpuSapBroadphase()
{
	delete m_sorter;

	// pair-finding kernels
	clReleaseKernel(m_sap2Kernel);
	clReleaseKernel(m_sapKernel);

	// helper kernels
	clReleaseKernel(m_copyAabbsKernel);
	clReleaseKernel(m_flipFloatKernel);
	clReleaseKernel(m_scatterKernel);
}
/// conservative test for overlap between two aabbs
/// Returns false when the boxes are separated along X, Z or Y; boxes that merely
/// touch (equal min/max) are reported as overlapping.
static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
								const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
{
	if (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX())
		return false;

	if (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ())
		return false;

	if (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY())
		return false;

	return true;
}
//http://stereopsis.com/radix.html
/// Maps a float to an unsigned key whose unsigned ordering matches the float ordering
/// (negative values have all bits inverted, non-negative values get the sign bit set).
/// @param fl value to convert
/// @return radix-sortable key
static unsigned int FloatFlip(float fl)
{
	// memcpy instead of *(unsigned int*)&fl: the pointer cast breaks strict aliasing
	unsigned int f;
	memcpy(&f, &fl, sizeof(f));

	const unsigned int mask = (f >> 31) ? 0xffffffffu : 0x80000000u;
	return f ^ mask;
}
/// One-time setup of the experimental 3-axis SAP buffers: downloads all AABBs,
/// sizes the two sort buffers of every axis and fills buffer 0 with
/// (flipped min coordinate, aabb index) entries. Does nothing once m_currentBuffer>=0.
void b3GpuSapBroadphase::init3dSap()
{
	if (m_currentBuffer>=0)
		return;

	m_allAabbsGPU.copyToHost(m_allAabbsCPU);
	m_currentBuffer = 0;

	const int totalNumAabbs = m_allAabbsCPU.size();

	for (int axis=0;axis<3;axis++)
	{
		for (int buf=0;buf<2;buf++)
		{
			m_sortedAxisCPU[axis][buf].resize(totalNumAabbs);

			if (buf!=m_currentBuffer)
				continue;

			for (int i=0;i<totalNumAabbs;i++)
			{
				// key comes from the float view (m_min); the int view m_minIndices would be
				// converted numerically to float and no longer sort like the coordinate
				m_sortedAxisCPU[axis][buf][i].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]);
				m_sortedAxisCPU[axis][buf][i].m_value = i;
			}
		}
	}
}
/// Experimental incremental 3-axis SAP step: downloads the AABBs, flips to the other
/// sort buffer and refills it with (flipped min coordinate, aabb index) entries.
/// Requires init3dSap() to have run; returns early otherwise.
/// NOTE(review): no sorting or pair generation happens here yet.
void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
{
	b3Assert(m_currentBuffer>=0);
	if (m_currentBuffer<0)
		return;

	m_allAabbsGPU.copyToHost(m_allAabbsCPU);

	for (int axis=0;axis<3;axis++)
	{
		for (int buf=0;buf<2;buf++)
		{
			b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size());
		}
	}

	m_currentBuffer = 1-m_currentBuffer;

	const int totalNumAabbs = m_allAabbsCPU.size();

	for (int axis=0;axis<3;axis++)
	{
		for (int i=0;i<totalNumAabbs;i++)
		{
			// use the float view (m_min); m_minIndices is the int view of the same storage and
			// would be converted numerically, giving keys that do not sort like the coordinate
			m_sortedAxisCPU[axis][m_currentBuffer][i].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]);
			m_sortedAxisCPU[axis][m_currentBuffer][i].m_value = i;
		}
	}
}
/// CPU reference path. Downloads all AABBs, refreshes the small and large AABB lists
/// from them, brute-force tests small/small and small/large combinations and uploads
/// the resulting pairs into m_overlappingPairs (resized to 0 when nothing overlaps).
void b3GpuSapBroadphase::calculateOverlappingPairsHost()
{
	//test
	//if (m_currentBuffer>=0)
	//	calculateOverlappingPairsHostIncremental3Sap();

	b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
	m_allAabbsGPU.copyToHost(m_allAabbsCPU);

	const int numSmallAabbs = m_smallAabbsCPU.size();
	const int numLargeAabbs = m_largeAabbsCPU.size();

	// sync aabb: m_signedMaxIndices[3] holds the index into m_allAabbsCPU and must
	// survive the copy, so it is written back afterwards
	for (int j=0;j<numSmallAabbs;j++)
	{
		int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
		m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
		m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
	}

	for (int j=0;j<numLargeAabbs;j++)
	{
		int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
		m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
		m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
	}

	b3AlignedObjectArray<b3Int2> hostPairs;

	// small versus small, each unordered combination once
	for (int i=0;i<numSmallAabbs;i++)
	{
		for (int j=i+1;j<numSmallAabbs;j++)
		{
			if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
				(b3Vector3&)m_smallAabbsCPU[j].m_min,(b3Vector3&)m_smallAabbsCPU[j].m_max))
			{
				b3Int2 pair;
				pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
				pair.y = m_smallAabbsCPU[j].m_minIndices[3];
				hostPairs.push_back(pair);
			}
		}
	}

	// small versus large; the large proxy goes into pair.x
	for (int i=0;i<numSmallAabbs;i++)
	{
		for (int j=0;j<numLargeAabbs;j++)
		{
			if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
				(b3Vector3&)m_largeAabbsCPU[j].m_min,(b3Vector3&)m_largeAabbsCPU[j].m_max))
			{
				b3Int2 pair;
				pair.x = m_largeAabbsCPU[j].m_minIndices[3];
				pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
				hostPairs.push_back(pair);
			}
		}
	}

	if (hostPairs.size())
	{
		m_overlappingPairs.copyFromHost(hostPairs);
	} else
	{
		m_overlappingPairs.resize(0);
	}

	//init3dSap();
}
/// GPU path. Steps, in order:
///  1. refresh m_smallAabbsGPU and m_largeAabbsGPU from m_allAabbsGPU (copyAabbsKernel),
///  2. build (flipped min, index) sort keys for the small AABBs and radix sort them,
///  3. scatter the small AABBs into sorted order,
///  4. run the large-vs-small kernel and then the single-array SAP kernel,
///  5. shrink m_overlappingPairs to the number of pairs reported (clamped to maxPairs).
void b3GpuSapBroadphase::calculateOverlappingPairs()
{
	int axis = 0;//todo on GPU for now hardcode

	{
		bool syncOnHost = false;

		if (syncOnHost)
		{
			B3_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
			m_allAabbsGPU.copyToHost(m_allAabbsCPU);
			m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
			{
				int numSmallAabbs = m_smallAabbsCPU.size();
				for (int j=0;j<numSmallAabbs;j++)
				{
					//sync aabb
					int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
					m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
					m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
				}
			}
			m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
		} else
		{
			{
				int numSmallAabbs = m_smallAabbsGPU.size();
				if (numSmallAabbs)
				{
					B3_PROFILE("copyAabbsKernelSmall");
					b3BufferInfoCL bInfo[] = {
						b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
						b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()),
					};
					b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
					launcher.setConst( numSmallAabbs );
					int num = numSmallAabbs;
					launcher.launch1D( num);
					clFinish(m_queue);
				}
			}
		}

		if (syncOnHost)
		{
			B3_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
			m_allAabbsGPU.copyToHost(m_allAabbsCPU);
			m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
			{
				int numLargeAabbs = m_largeAabbsCPU.size();
				for (int j=0;j<numLargeAabbs;j++)
				{
					//sync aabb
					int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
					m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
					m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
				}
			}
			m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
		} else
		{
			int numLargeAabbs = m_largeAabbsGPU.size();
			if (numLargeAabbs)
			{
				B3_PROFILE("copyAabbsKernelLarge");
				b3BufferInfoCL bInfo[] = {
					b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
					b3BufferInfoCL( m_largeAabbsGPU.getBufferCL()),
				};
				b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
				launcher.setConst( numLargeAabbs );
				int num = numLargeAabbs;
				launcher.launch1D( num);
				clFinish(m_queue);
			}
		}

		B3_PROFILE("GPU SAP");

		int numSmallAabbs = m_smallAabbsGPU.size();
		m_gpuSmallSortData.resize(numSmallAabbs);
		// the large-AABB count is read where it is needed, inside the sap2Kernel scope below

#if 1
		if (m_smallAabbsGPU.size())
		{
			B3_PROFILE("flipFloatKernel");
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())};
			b3LauncherCL launcher(m_queue, m_flipFloatKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( numSmallAabbs );
			launcher.setConst( axis );
			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		{
			B3_PROFILE("gpu radix sort\n");
			m_sorter->execute(m_gpuSmallSortData);
			clFinish(m_queue);
		}

		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
		if (numSmallAabbs)
		{
			B3_PROFILE("scatterKernel");
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
			b3LauncherCL launcher(m_queue, m_scatterKernel );
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( numSmallAabbs);
			int num = numSmallAabbs;
			launcher.launch1D( num);
			clFinish(m_queue);
		}

		// output capacity; the kernels never write at or beyond maxPairs
		int maxPairsPerBody = 64;
		int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
		m_overlappingPairs.resize(maxPairs);

		// single-element GPU counter shared by both pair kernels
		b3OpenCLArray<int> pairCount(m_context, m_queue);
		pairCount.push_back(0);
		int numPairs=0;

		{
			int numLargeAabbs = m_largeAabbsGPU.size();
			if (numLargeAabbs && numSmallAabbs)
			{
				B3_PROFILE("sap2Kernel");
				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_largeAabbsGPU.getBufferCL() ),b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
				b3LauncherCL launcher(m_queue, m_sap2Kernel);
				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
				launcher.setConst( numLargeAabbs );
				launcher.setConst( numSmallAabbs);
				launcher.setConst( axis );
				launcher.setConst( maxPairs );
				//@todo: use actual maximum work item sizes of the device instead of hardcoded values
				launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);

				numPairs = pairCount.at(0);
				if (numPairs >maxPairs)
					numPairs =maxPairs;
			}
		}

		if (m_gpuSmallSortedAabbs.size())
		{
			B3_PROFILE("sapKernel");
			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
			b3LauncherCL launcher(m_queue, m_sapKernel);
			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
			launcher.setConst( numSmallAabbs );
			launcher.setConst( axis );
			launcher.setConst( maxPairs );

			int num = numSmallAabbs;
#if 0
			int buffSize = launcher.getSerializationBufferSize();
			unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
			for (int i=0;i<buffSize+1;i++)
			{
				unsigned char* ptr = (unsigned char*)&buf[i];
				*ptr = 0xff;
			}
			int actualWrite = launcher.serializeArguments(buf,buffSize);

			unsigned char* cptr = (unsigned char*)&buf[buffSize];
			// printf("buf[buffSize] = %d\n",*cptr);

			assert(buf[buffSize]==0xff);//check for buffer overrun
			int* ptr = (int*)&buf[buffSize];

			*ptr = num;

			FILE* f = fopen("m_sapKernelArgs.bin","wb");
			fwrite(buf,buffSize+sizeof(int),1,f);
			fclose(f);
#endif//

			launcher.launch1D( num);
			clFinish(m_queue);

			// the counter keeps counting past maxPairs, so clamp to what was stored
			numPairs = pairCount.at(0);
			if (numPairs>maxPairs)
				numPairs = maxPairs;
		}

#else
		int numPairs = 0;

		b3LauncherCL launcher(m_queue, m_sapKernel);

		const char* fileName = "m_sapKernelArgs.bin";
		FILE* f = fopen(fileName,"rb");
		if (f)
		{
			int sizeInBytes=0;
			if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
			{
				printf("error, cannot get file size\n");
				exit(0);
			}

			unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
			fread(buf,sizeInBytes,1,f);
			int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
			int num = *(int*)&buf[serializedBytes];
			launcher.launch1D( num);

			b3OpenCLArray<int> pairCount(m_context, m_queue);
			int numElements = launcher.m_arrays[2]->size()/sizeof(int);
			pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
			numPairs = pairCount.at(0);
			//printf("overlapping pairs = %d\n",numPairs);
			b3AlignedObjectArray<b3Int2> hostOoverlappingPairs;
			b3OpenCLArray<b3Int2> tmpGpuPairs(m_context,m_queue);
			tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );

			tmpGpuPairs.copyToHost(hostOoverlappingPairs);
			m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
			//printf("hello %d\n", m_overlappingPairs.size());
			free(buf);
			fclose(f);

		} else {
			printf("error: cannot find file %s\n",fileName);
		}

		clFinish(m_queue);

#endif

		m_overlappingPairs.resize(numPairs);

	}//B3_PROFILE("GPU_RADIX SORT");
}
/// Uploads the three host-side AABB lists (all, small, large) to their GPU arrays.
void b3GpuSapBroadphase::writeAabbsToGpu()
{
	//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
	m_allAabbsGPU.copyFromHost(m_allAabbsCPU);

	// per-category lists used by the SAP kernels
	m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
	m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
}
void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
{
int index = userPtr;
b3SapAabb aabb;
for (int i=0;i<4;i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
}
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_largeAabbsCPU.push_back(aabb);
m_allAabbsCPU.push_back(aabb);
}
void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
{
int index = userPtr;
b3SapAabb aabb;
for (int i=0;i<4;i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
}
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_smallAabbsCPU.push_back(aabb);
m_allAabbsCPU.push_back(aabb);
}
/// @return the OpenCL buffer behind m_allAabbsGPU
cl_mem	b3GpuSapBroadphase::getAabbBufferWS()
{
	cl_mem allAabbsBuffer = m_allAabbsGPU.getBufferCL();
	return allAabbsBuffer;
}
/// @return current element count of m_overlappingPairs
int	b3GpuSapBroadphase::getNumOverlap()
{
	const int pairTotal = m_overlappingPairs.size();
	return pairTotal;
}
/// @return the OpenCL buffer behind m_overlappingPairs
cl_mem	b3GpuSapBroadphase::getOverlappingPairBuffer()
{
	cl_mem pairBuffer = m_overlappingPairs.getBufferCL();
	return pairBuffer;
}

View File

@@ -0,0 +1,69 @@
#ifndef B3_GPU_SAP_BROADPHASE_H
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
#include "b3SapAabb.h"
/// Sweep-and-prune broadphase that keeps AABBs on both host and GPU and produces
/// overlapping pairs either with OpenCL kernels or with a brute-force host path.
/// Typical use: createProxy/createLargeProxy, then writeAabbsToGpu, then
/// calculateOverlappingPairs (or calculateOverlappingPairsHost).
class b3GpuSapBroadphase
{
	cl_context				m_context;
	cl_device_id			m_device;
	cl_command_queue		m_queue;

	// kernels created in the constructor and released in the destructor
	cl_kernel				m_flipFloatKernel;
	cl_kernel				m_scatterKernel ;
	cl_kernel				m_copyAabbsKernel;
	cl_kernel				m_sapKernel;
	cl_kernel				m_sap2Kernel;

	// owned; allocated in the constructor, deleted in the destructor
	class b3RadixSort32CL* m_sorter;

	///test for 3d SAP
	b3AlignedObjectArray<b3SortData>		m_sortedAxisCPU[3][2];
	int	m_currentBuffer;	// -1 until init3dSap() ran, afterwards 0 or 1

public:

	// every proxy, in creation order
	b3OpenCLArray<b3SapAabb>	m_allAabbsGPU;
	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU;

	// proxies added through createProxy
	b3OpenCLArray<b3SapAabb>	m_smallAabbsGPU;
	b3AlignedObjectArray<b3SapAabb>	m_smallAabbsCPU;

	// proxies added through createLargeProxy
	b3OpenCLArray<b3SapAabb>	m_largeAabbsGPU;
	b3AlignedObjectArray<b3SapAabb>	m_largeAabbsCPU;

	// result of the last calculateOverlappingPairs / calculateOverlappingPairsHost call
	b3OpenCLArray<b3Int2>		m_overlappingPairs;

	//temporary gpu work memory
	b3OpenCLArray<b3SortData>	m_gpuSmallSortData;
	b3OpenCLArray<b3SapAabb>	m_gpuSmallSortedAabbs;

	b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
	virtual ~b3GpuSapBroadphase();

	void  calculateOverlappingPairs();
	void  calculateOverlappingPairsHost();

	void init3dSap();
	void calculateOverlappingPairsHostIncremental3Sap();

	void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
	void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);

	//call writeAabbsToGpu after done making all changes (createProxy etc)
	void writeAabbsToGpu();

	cl_mem	getAabbBufferWS();
	int	getNumOverlap();
	cl_mem	getOverlappingPairBuffer();

private:
	// Not copyable: a copy would release the same kernels and delete the same
	// sorter a second time in its destructor. Declared but intentionally not defined.
	b3GpuSapBroadphase(const b3GpuSapBroadphase&);
	b3GpuSapBroadphase& operator=(const b3GpuSapBroadphase&);
};
#endif //B3_GPU_SAP_BROADPHASE_H

View File

@@ -0,0 +1,18 @@
#ifndef B3_SAP_AABB_H
#define B3_SAP_AABB_H
// Axis-aligned bounding box used by the SAP broadphase. Each corner is four
// 32-bit lanes that can be viewed either as floats or as ints (same storage).
struct b3SapAabb
{
// minimum corner; lane 3 is written through m_minIndices by createProxy /
// createLargeProxy to hold the caller's userPtr index
union
{
float m_min[4];
int m_minIndices[4];
};
// maximum corner; lane 3 is written through m_signedMaxIndices by createProxy /
// createLargeProxy to hold the box's position in the "all AABBs" array
union
{
float m_max[4];
int m_signedMaxIndices[4];
};
};
#endif //B3_SAP_AABB_H

View File

@@ -0,0 +1,320 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
// Device-side AABB: two 16-byte corners, each viewable as a float4, as four
// floats or as four ints (all three views share the same storage).
typedef struct
{
// minimum corner; the kernels in this file read m_minIndices[3] as the
// object index that is written into the output pairs
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
// maximum corner; copyAabbsKernel reads and restores m_maxIndices[3] as the
// source index into the "all AABBs" array
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative test for overlap between two aabbs
/// aabb1 lives in private memory, aabb2 in local memory; touching boxes overlap.
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
/// Overlap test for two aabbs that both live in global memory; touching boxes overlap.
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
/// Overlap test for a private-memory aabb against a global-memory aabb; touching boxes overlap.
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
/// 2D kernel: work-item (x,y) tests unsortedAabbs[x] against sortedAabbs[y] and appends
/// an (x index, y index) pair when they overlap. pairCount is incremented for every hit;
/// only hits whose slot is below maxPairs are stored. 'axis' is not used here.
__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
{
	int unsortedIdx = get_global_id(0);
	if (unsortedIdx>=numUnsortedAabbs)
		return;

	int sortedIdx = get_global_id(1);
	if (sortedIdx>=numSortedAabbs)
		return;

	if (!TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[unsortedIdx],&sortedAabbs[sortedIdx]))
		return;

	int2 hit;
	hit.x = unsortedAabbs[unsortedIdx].m_minIndices[3];
	hit.y = sortedAabbs[sortedIdx].m_minIndices[3];

	int slot = atomic_inc (pairCount);
	if (slot<maxPairs)
	{
		pairsOut[slot] = hit; //flush to main memory
	}
}
/// 1D kernel over aabbs ordered by their minimum along 'axis': work-item i walks the
/// following boxes until one starts beyond box i's maximum on that axis, appending a
/// pair for every overlap. Pairs whose slot is not below maxPairs are counted but not stored.
__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	int i = get_global_id(0);
	if (i>=numObjects)
		return;

	// upper bound of box i on the sweep axis; boxes starting after it cannot overlap
	float sweepEnd = aabbs[i].m_maxElems[axis];

	int j = i+1;
	while (j<numObjects && !(sweepEnd < aabbs[j].m_minElems[axis]))
	{
		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
		{
			int2 hit;
			hit.x = aabbs[i].m_minIndices[3];
			hit.y = aabbs[j].m_minIndices[3];

			int slot = atomic_inc (pairCount);
			if (slot<maxPairs)
			{
				pairsOut[slot] = hit; //flush to main memory
			}
		}
		j++;
	}
}
// Same sweep as computePairsKernelOriginal, but the whole work-group steps through
// the candidates in lock-step: every work-item keeps iterating (hitting the same
// barriers) until all work-items of the group have asked to stop.
// aabbs      : boxes to test; the early-out assumes they are ordered by minimum on 'axis'
// pairsOut   : output pairs, written only for slots below maxPairs
// pairCount  : global counter, incremented for every overlap found
// numObjects : number of entries in aabbs
__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
// numActiveWgItems: how many work-items this group has
// breakRequest    : how many of them are finished with their sweep
__local int numActiveWgItems[1];
__local int breakRequest[1];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
// every work-item registers itself once
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
// candidate j starts beyond box i on the sweep axis: this work-item is done
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// ran off the end of the array: also done (counted once thanks to localBreak)
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int2 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
j++;
// loop until every registered work-item has requested a break
} while (breakRequest[0]<numActiveWgItems[0]);
}
// Lock-step sweep like computePairsKernelBarrier, but candidates are read from a
// 128-entry window in local memory instead of global memory.
// NOTE(review): the localId / localId+64 indexing and the refill every 64 steps
// look like they assume a work-group size of 64 — confirm against the launcher.
// aabbs      : boxes to test; the early-out assumes they are ordered by minimum on 'axis'
// pairsOut   : output pairs, written only for slots below maxPairs
// pairCount  : global counter, incremented for every overlap found
// numObjects : number of entries in aabbs
__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
// numActiveWgItems: how many work-items this group has
// breakRequest    : how many of them are finished with their sweep
__local int numActiveWgItems[1];
__local int breakRequest[1];
__local btAabbCL localAabbs[128];// = aabbs[i];
btAabbCL myAabb;
// out-of-range work-items use aabbs[0] as a stand-in so they can still take part in the barriers
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
float testValue = 	myAabb.m_maxElems[axis];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
// localCount: steps taken inside the current window; block: offset of the window
int localCount=0;
int block=0;
// each work-item loads two entries of the window (aabbs[0] is the filler past the end)
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
// localAabbs[localCount+localId+1] is this work-item's current candidate (tracks j)
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
// ran off the end of the array: also done (counted once thanks to localBreak)
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
{
int2 myPair;
myPair.x = myAabb.m_minIndices[3];
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
// all reads of the window are finished before it may be refilled below
barrier(CLK_LOCAL_MEM_FENCE);
localCount++;
if (localCount==64)
{
// window exhausted: slide it forward by 64 entries and reload both halves
localCount = 0;
block+=64;
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
}
j++;
// loop until every registered work-item has requested a break
} while (breakRequest[0]<numActiveWgItems[0]);
}
//http://stereopsis.com/radix.html
/// Converts a float into an unsigned key whose unsigned order matches the float order:
/// negative values get all bits inverted, non-negative values get the sign bit set.
unsigned int FloatFlip(float fl);
unsigned int FloatFlip(float fl)
{
	// as_uint reinterprets the bits without the aliasing-unsafe pointer cast
	unsigned int f = as_uint(fl);
	unsigned int mask = -(int)(f >> 31) | 0x80000000;
	return f ^ mask;
}
/// Inverse of FloatFlip: turns a sortable key back into the float it was made from.
float IFloatFlip(unsigned int f);
float IFloatFlip(unsigned int f)
{
	unsigned int mask = ((f >> 31) - 1) | 0x80000000;
	// as_float reinterprets the bits without the aliasing-unsafe pointer cast
	return as_float(f ^ mask);
}
/// Refreshes destAabbs[i] from allAabbs: the source slot is read from
/// destAabbs[i].m_maxIndices[3] and written back after the copy so it survives.
__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
	int dst = get_global_id(0);
	if (dst<numObjects)
	{
		int srcSlot = destAabbs[dst].m_maxIndices[3];
		destAabbs[dst] = allAabbs[srcSlot];
		destAabbs[dst].m_maxIndices[3] = srcSlot;
	}
}
/// Builds one sort entry per aabb: x = sortable key of the minimum on 'axis', y = aabb index.
__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
{
	int idx = get_global_id(0);
	if (idx<numObjects)
	{
		sortData[idx].x = FloatFlip(aabbs[idx].m_minElems[axis]);
		sortData[idx].y = idx;
	}
}
/// Reorders aabbs into sortedAabbs: output slot i receives the aabb whose index is sortData[i].y.
__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
{
	int outSlot = get_global_id(0);
	if (outSlot<numObjects)
	{
		sortedAabbs[outSlot] = aabbs[sortData[outSlot].y];
	}
}

View File

@@ -0,0 +1,161 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
// Device-side AABB: two 16-byte corners, each viewable as a float4, as four
// floats or as four ints (all three views share the same storage).
typedef struct
{
// minimum corner; computePairsKernel reads m_minIndices[3] as the object
// index that is written into the output pairs
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
// maximum corner; TestAabbAgainstAabb2 in this file inspects m_maxIndices[3]
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative test for overlap between two aabbs
/// aabb1 lives in private memory, aabb2 in local memory; touching boxes overlap.
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
	//skip pairs between static (mass=0) objects
	// NOTE(review): this relies on m_maxIndices[3]==0 marking a static object — confirm with the host code
	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
		return false;

	if (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x)
		return false;
	if (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z)
		return false;
	if (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y)
		return false;
	return true;
}
//computePairsKernelBatchWrite
// Lock-step sweep over a 128-entry local-memory window (see the barriers below); hits
// are collected in a private 64-entry batch and flushed with one atomic_add per batch.
// NOTE(review): the localId / localId+64 indexing and the refill every 64 steps
// look like they assume a work-group size of 64 — confirm against the launcher.
// NOTE: the kernel source compiled at run time is the stringified copy (generated by
// stringify.bat / premake --stringify); regenerate it after editing this file.
// aabbs      : boxes to test; the early-out assumes they are ordered by minimum on 'axis'
// pairsOut   : output pairs, only slots below maxPairs are ever written
// pairCount  : global counter, advanced by the size of every flushed batch
// numObjects : number of entries in aabbs
__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
{
	int i = get_global_id(0);
	int localId = get_local_id(0);

	// numActiveWgItems: how many work-items this group has
	// breakRequest    : how many of them are finished with their sweep
	__local int numActiveWgItems[1];
	__local int breakRequest[1];
	__local btAabbCL localAabbs[128];// = aabbs[i];

	int2 myPairs[64];

	btAabbCL myAabb;

	// out-of-range work-items use aabbs[0] as a stand-in so they can still take part in the barriers
	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
	float testValue = 	myAabb.m_maxElems[axis];

	if (localId==0)
	{
		numActiveWgItems[0] = 0;
		breakRequest[0] = 0;
	}
	int localCount=0;
	int block=0;
	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];

	barrier(CLK_LOCAL_MEM_FENCE);
	atomic_inc(numActiveWgItems);
	barrier(CLK_LOCAL_MEM_FENCE);
	int localBreak = 0;
	int curNumPairs = 0;

	int j=i+1;
	do
	{
		barrier(CLK_LOCAL_MEM_FENCE);

		if (j<numObjects)
		{
			if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
			{
				if (!localBreak)
				{
					atomic_inc(breakRequest);
					localBreak = 1;
				}
			}
		}

		barrier(CLK_LOCAL_MEM_FENCE);

		if (j>=numObjects && !localBreak)
		{
			atomic_inc(breakRequest);
			localBreak = 1;
		}
		barrier(CLK_LOCAL_MEM_FENCE);

		if (!localBreak)
		{
			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
			{
				int2 myPair;
				myPair.x = myAabb.m_minIndices[3];
				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
				myPairs[curNumPairs] = myPair;
				curNumPairs++;
				if (curNumPairs==64)
				{
					int curPair = atomic_add(pairCount,curNumPairs);
					// Bounds-check every element instead of the whole batch: the host clamps
					// the pair count to maxPairs, so each slot below maxPairs must be written
					// even when only part of the batch fits.
					for (int p=0;p<curNumPairs;p++)
					{
						if ((curPair+p)<maxPairs)
						{
							pairsOut[curPair+p] = myPairs[p]; //flush to main memory
						}
					}
					curNumPairs = 0;
				}
			}
		}
		// all reads of the window are finished before it may be refilled below
		barrier(CLK_LOCAL_MEM_FENCE);

		localCount++;
		if (localCount==64)
		{
			localCount = 0;
			block+=64;
			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
		}
		j++;

	} while (breakRequest[0]<numActiveWgItems[0]);

	// flush whatever is left in the private batch
	if (curNumPairs>0)
	{
		int curPair = atomic_add(pairCount,curNumPairs);
		for (int p=0;p<curNumPairs;p++)
		{
			//avoid a buffer overrun
			if ((curPair+p)<maxPairs)
			{
				pairsOut[curPair+p] = myPairs[p]; //flush to main memory
			}
		}
		curNumPairs = 0;
	}
}

View File

@@ -0,0 +1,164 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//
// sapFastCL holds OpenCL C source text as a single C string. It is assembled from
// adjacent string literals, one per source line, each terminated by "\n".
// Because the file is generated, change the originating .cl file and re-run the
// stringify step rather than editing the literals below by hand.
//
// The embedded program defines:
//
//  * b3AabbCL
//      Two unions (min and max). Each overlays a float4 with a float[4] view
//      (m_minElems / m_maxElems) and an int[4] view (m_minIndices / m_maxIndices).
//      The kernel writes m_minIndices[3] into the output pairs, and the overlap test
//      treats m_maxIndices[3]==0 on both boxes as "static (mass=0)".
//
//  * TestAabbAgainstAabb2(private aabb1, __local aabb2)
//      Returns false when both boxes have m_maxIndices[3]==0; otherwise returns
//      whether the boxes overlap on x, y and z.
//
//  * computePairsKernel(aabbs, pairsOut, pairCount, numObjects, axis, maxPairs)
//      Work item i = get_global_id(0) compares aabbs[i] with the entries that follow it
//      (j starts at i+1). Candidates are read from a 128-entry __local cache that every
//      work item helps fill (slots localId and localId+64) and that is refilled after
//      every 64 iterations. Indices >= numObjects are replaced by aabbs[0].
//      A work item raises its break flag once its own max on `axis` is smaller than the
//      candidate's min on `axis`, or once j reaches numObjects. The do/while loop keeps
//      running (with barriers) until breakRequest[0] is no longer smaller than
//      numActiveWgItems[0], so all work items of a group leave the loop together.
//      Overlapping pairs are collected in a private int2[64] buffer. When the buffer is
//      full, and once more after the loop, a range is reserved with atomic_add on
//      pairCount and the batch is copied to pairsOut. The copy is skipped when
//      (curPair+curNumPairs) < maxPairs does not hold, but pairCount has already been
//      advanced at that point.
//
// NOTE(review): the early-out on `axis` presumably relies on aabbs being sorted by their
// minimum along that axis -- confirm against the host code that launches this kernel.
// NOTE(review): the localId / localId+64 indexing into a 128-entry array looks like it
// assumes a work-group size of 64 -- confirm against the host-side launch parameters.
static const char* sapFastCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
"{\n"
"//skip pairs between static (mass=0) objects\n"
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
" return false;\n"
" \n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"\n"
"//computePairsKernelBatchWrite\n"
"__kernel void computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" int2 myPairs[64];\n"
" \n"
" b3AabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int curNumPairs = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPairs[curNumPairs] = myPair;\n"
" curNumPairs++;\n"
" if (curNumPairs==64)\n"
" {\n"
" int curPair = atomic_add(pairCount,curNumPairs);\n"
" //avoid a buffer overrun\n"
" if ((curPair+curNumPairs)<maxPairs)\n"
" {\n"
" for (int p=0;p<curNumPairs;p++)\n"
" {\n"
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
" }\n"
" }\n"
" curNumPairs = 0;\n"
" }\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
" \n"
" if (curNumPairs>0)\n"
" {\n"
" //avoid a buffer overrun\n"
" int curPair = atomic_add(pairCount,curNumPairs);\n"
" if ((curPair+curNumPairs)<maxPairs)\n"
" {\n"
" for (int p=0;p<curNumPairs;p++)\n"
" {\n"
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
" }\n"
" }\n"
" curNumPairs = 0;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,324 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
//
// sapCL holds OpenCL C source text as a single C string. It is assembled from
// adjacent string literals, one per source line, each terminated by "\n".
// Because the file is generated, change the originating .cl file and re-run the
// stringify step rather than editing the literals below by hand.
//
// The embedded program defines:
//
//  * b3AabbCL
//      Two unions (min and max). Each overlays a float4 with a float[4] view
//      (m_minElems / m_maxElems) and an int[4] view (m_minIndices / m_maxIndices).
//      The pair kernels emit m_minIndices[3] of each box as the pair members;
//      copyAabbsKernel uses m_maxIndices[3] as a source index.
//
//  * TestAabbAgainstAabb2 / TestAabbAgainstAabb2GlobalGlobal / TestAabbAgainstAabb2Global
//      The same x/y/z overlap test, written three times for different address-space
//      combinations of the two arguments (private+__local, __global+__global,
//      private+__global).
//
//  * computePairsKernelTwoArrays
//      Two-dimensional index space: get_global_id(0) selects an entry of unsortedAabbs,
//      get_global_id(1) an entry of sortedAabbs. Every overlapping combination takes a
//      slot via atomic_inc on pairCount and is written to pairsOut if the slot is
//      < maxPairs. The `axis` argument is not read by this kernel.
//
//  * computePairsKernelOriginal
//      One work item per box i; loops j from i+1 and leaves the loop as soon as box i's
//      max on `axis` is smaller than box j's min on `axis`. Overlaps are written as above.
//      NOTE(review): the early exit presumably relies on aabbs being sorted by their
//      minimum along `axis` -- confirm against the host code.
//
//  * computePairsKernelBarrier
//      Same test as computePairsKernelOriginal, but instead of breaking individually each
//      work item increments the __local counter breakRequest once. The do/while loop,
//      which contains barriers, runs until breakRequest[0] is no longer smaller than
//      numActiveWgItems[0].
//
//  * computePairsKernelLocalSharedMemory
//      Same loop structure as computePairsKernelBarrier, but the candidate boxes are read
//      from a 128-entry __local cache (filled at slots localId and localId+64, refilled
//      every 64 iterations; indices >= numObjects are replaced by aabbs[0]).
//      NOTE(review): this indexing looks like it assumes a work-group size of 64 --
//      confirm against the host-side launch parameters.
//
//  * FloatFlip / IFloatFlip
//      Bit transforms on the 32-bit pattern of a float (see the URL in the source).
//      FloatFlip inverts all bits when the sign bit is set and only the sign bit
//      otherwise; IFloatFlip applies the reverse mapping.
//
//  * copyAabbsKernel
//      For each i < numObjects: reads src = destAabbs[i].m_maxIndices[3], overwrites
//      destAabbs[i] with allAabbs[src], then stores src back into m_maxIndices[3].
//
//  * flipFloatKernel
//      For each i < numObjects: sortData[i].x = FloatFlip(min of box i on `axis`),
//      sortData[i].y = i.
//
//  * scatterKernel
//      For each i < numObjects: sortedAabbs[i] = aabbs[sortData[i].y].
static const char* sapCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} b3AabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
"\n"
" int j = get_global_id(1);\n"
" if (j>=numSortedAabbs)\n"
" return;\n"
"\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" \n"
" myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
" myPair.y = sortedAabbs[j].m_minIndices[3];\n"
"\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel void computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
"\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
"\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
"\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local b3AabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" b3AabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"\n"
"\n"
"__kernel void flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"\n"
"\n"
"__kernel void scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
"\n"
" sortedAabbs[i] = aabbs[sortData[i].y];\n"
"}\n"
"\n"
;