add gpu_broadphase with basic test

This commit is contained in:
erwin coumans
2013-03-12 13:47:13 -07:00
parent 08272c7de5
commit 477a7f9e39
11 changed files with 1705 additions and 2 deletions

View File

@@ -0,0 +1,485 @@
#include "btGpuSapBroadphase.h"
#include "parallel_primitives/host/btVector3.h"
#include "parallel_primitives/host/btLauncherCL.h"
#include "parallel_primitives/host/btQuickprof.h"
#include "../basic_initialize/btOpenCLUtils.h"
#include "../kernels/integrateKernel.h"
#include "../kernels/sapKernels.h"
#include "../kernels/sapFastKernels.h"
#include "parallel_primitives/host/btMinMax.h"
btGpuSapBroadphase::btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU(ctx,q),
m_smallAabbsGPU(ctx,q),
m_largeAabbsGPU(ctx,q),
m_overlappingPairs(ctx,q),
m_gpuSmallSortData(ctx,q),
m_gpuSmallSortedAabbs(ctx,q)
{
const char* sapSrc = sapCL;
const char* sapFastSrc = sapFastCL;
cl_int errNum=0;
cl_program sapProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl");
btAssert(errNum==CL_SUCCESS);
cl_program sapFastProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl");
btAssert(errNum==CL_SUCCESS);
//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
m_sap2Kernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
btAssert(errNum==CL_SUCCESS);
#if 0
m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
btAssert(errNum==CL_SUCCESS);
#else
#ifndef __APPLE__
m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
btAssert(errNum==CL_SUCCESS);
#else
m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
btAssert(errNum==CL_SUCCESS);
#endif
#endif
m_flipFloatKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
m_copyAabbsKernel= btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
m_scatterKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
m_sorter = new btRadixSort32CL(m_context,m_device,m_queue);
}
btGpuSapBroadphase::~btGpuSapBroadphase()
{
delete m_sorter;
clReleaseKernel(m_scatterKernel);
clReleaseKernel(m_flipFloatKernel);
clReleaseKernel(m_copyAabbsKernel);
clReleaseKernel(m_sapKernel);
clReleaseKernel(m_sap2Kernel);
}
/// conservative test for overlap between two aabbs
static bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1,
const btVector3 &aabbMin2, const btVector3 &aabbMax2)
{
bool overlap = true;
overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
return overlap;
}
void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
int axis = 0;//todo on GPU for now hardcode
btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
if (forceHost)
{
btAlignedObjectArray<btSapAabb> allHostAabbs;
m_allAabbsGPU.copyToHost(allHostAabbs);
{
int numSmallAabbs = m_smallAabbsCPU.size();
for (int j=0;j<numSmallAabbs;j++)
{
//sync aabb
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
{
int numLargeAabbs = m_largeAabbsCPU.size();
for (int j=0;j<numLargeAabbs;j++)
{
//sync aabb
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
btAlignedObjectArray<btInt2> hostPairs;
{
int numSmallAabbs = m_smallAabbsCPU.size();
for (int i=0;i<numSmallAabbs;i++)
{
float reference = m_smallAabbsCPU[i].m_max[axis];
for (int j=i+1;j<numSmallAabbs;j++)
{
if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
(btVector3&)m_smallAabbsCPU[j].m_min,(btVector3&)m_smallAabbsCPU[j].m_max))
{
btInt2 pair;
pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
pair.y = m_smallAabbsCPU[j].m_minIndices[3];
hostPairs.push_back(pair);
}
}
}
}
{
int numSmallAabbs = m_smallAabbsCPU.size();
for (int i=0;i<numSmallAabbs;i++)
{
float reference = m_smallAabbsCPU[i].m_max[axis];
int numLargeAabbs = m_largeAabbsCPU.size();
for (int j=0;j<numLargeAabbs;j++)
{
if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
(btVector3&)m_largeAabbsCPU[j].m_min,(btVector3&)m_largeAabbsCPU[j].m_max))
{
btInt2 pair;
pair.x = m_largeAabbsCPU[j].m_minIndices[3];
pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
hostPairs.push_back(pair);
}
}
}
}
if (hostPairs.size())
{
m_overlappingPairs.copyFromHost(hostPairs);
} else
{
m_overlappingPairs.resize(0);
}
return;
}
{
bool syncOnHost = false;
if (syncOnHost)
{
BT_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
btAlignedObjectArray<btSapAabb> allHostAabbs;
m_allAabbsGPU.copyToHost(allHostAabbs);
m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
{
int numSmallAabbs = m_smallAabbsCPU.size();
for (int j=0;j<numSmallAabbs;j++)
{
//sync aabb
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
} else
{
{
int numSmallAabbs = m_smallAabbsGPU.size();
BT_PROFILE("copyAabbsKernelSmall");
btBufferInfoCL bInfo[] = {
btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
btBufferInfoCL( m_smallAabbsGPU.getBufferCL()),
};
btLauncherCL launcher(m_queue, m_copyAabbsKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( numSmallAabbs );
int num = numSmallAabbs;
launcher.launch1D( num);
clFinish(m_queue);
}
}
if (syncOnHost)
{
BT_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
btAlignedObjectArray<btSapAabb> allHostAabbs;
m_allAabbsGPU.copyToHost(allHostAabbs);
m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
{
int numLargeAabbs = m_largeAabbsCPU.size();
for (int j=0;j<numLargeAabbs;j++)
{
//sync aabb
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
} else
{
int numLargeAabbs = m_largeAabbsGPU.size();
if (numLargeAabbs)
{
BT_PROFILE("copyAabbsKernelLarge");
btBufferInfoCL bInfo[] = {
btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
btBufferInfoCL( m_largeAabbsGPU.getBufferCL()),
};
btLauncherCL launcher(m_queue, m_copyAabbsKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( numLargeAabbs );
int num = numLargeAabbs;
launcher.launch1D( num);
clFinish(m_queue);
}
}
BT_PROFILE("GPU SAP");
int numSmallAabbs = m_smallAabbsGPU.size();
m_gpuSmallSortData.resize(numSmallAabbs);
int numLargeAabbs = m_smallAabbsGPU.size();
#if 1
if (m_smallAabbsGPU.size())
{
BT_PROFILE("flipFloatKernel");
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL())};
btLauncherCL launcher(m_queue, m_flipFloatKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( numSmallAabbs );
launcher.setConst( axis );
int num = numSmallAabbs;
launcher.launch1D( num);
clFinish(m_queue);
}
{
BT_PROFILE("gpu radix sort\n");
m_sorter->execute(m_gpuSmallSortData);
clFinish(m_queue);
}
m_gpuSmallSortedAabbs.resize(numSmallAabbs);
if (numSmallAabbs)
{
BT_PROFILE("scatterKernel");
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
btLauncherCL launcher(m_queue, m_scatterKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( numSmallAabbs);
int num = numSmallAabbs;
launcher.launch1D( num);
clFinish(m_queue);
}
int maxPairsPerBody = 64;
int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
m_overlappingPairs.resize(maxPairs);
btOpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
int numPairs=0;
{
int numLargeAabbs = m_largeAabbsGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
BT_PROFILE("sap2Kernel");
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
btLauncherCL launcher(m_queue, m_sap2Kernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( numLargeAabbs );
launcher.setConst( numSmallAabbs);
launcher.setConst( axis );
launcher.setConst( maxPairs );
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
numPairs = pairCount.at(0);
if (numPairs >maxPairs)
numPairs =maxPairs;
}
}
if (m_gpuSmallSortedAabbs.size())
{
BT_PROFILE("sapKernel");
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
btLauncherCL launcher(m_queue, m_sapKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( numSmallAabbs );
launcher.setConst( axis );
launcher.setConst( maxPairs );
int num = numSmallAabbs;
#if 0
int buffSize = launcher.getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
for (int i=0;i<buffSize+1;i++)
{
unsigned char* ptr = (unsigned char*)&buf[i];
*ptr = 0xff;
}
int actualWrite = launcher.serializeArguments(buf,buffSize);
unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize]==0xff);//check for buffer overrun
int* ptr = (int*)&buf[buffSize];
*ptr = num;
FILE* f = fopen("m_sapKernelArgs.bin","wb");
fwrite(buf,buffSize+sizeof(int),1,f);
fclose(f);
#endif//
launcher.launch1D( num);
clFinish(m_queue);
numPairs = pairCount.at(0);
if (numPairs>maxPairs)
numPairs = maxPairs;
}
#else
int numPairs = 0;
btLauncherCL launcher(m_queue, m_sapKernel);
const char* fileName = "m_sapKernelArgs.bin";
FILE* f = fopen(fileName,"rb");
if (f)
{
int sizeInBytes=0;
if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
{
printf("error, cannot get file size\n");
exit(0);
}
unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
fread(buf,sizeInBytes,1,f);
int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
int num = *(int*)&buf[serializedBytes];
launcher.launch1D( num);
btOpenCLArray<int> pairCount(m_context, m_queue);
int numElements = launcher.m_arrays[2]->size()/sizeof(int);
pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
numPairs = pairCount.at(0);
//printf("overlapping pairs = %d\n",numPairs);
btAlignedObjectArray<btInt2> hostOoverlappingPairs;
btOpenCLArray<btInt2> tmpGpuPairs(m_context,m_queue);
tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
tmpGpuPairs.copyToHost(hostOoverlappingPairs);
m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
//printf("hello %d\n", m_overlappingPairs.size());
free(buf);
fclose(f);
} else {
printf("error: cannot find file %s\n",fileName);
}
clFinish(m_queue);
#endif
m_overlappingPairs.resize(numPairs);
}//BT_PROFILE("GPU_RADIX SORT");
}
void btGpuSapBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
}
void btGpuSapBroadphase::createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
{
int index = userPtr;
btSapAabb aabb;
for (int i=0;i<4;i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
}
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_largeAabbsCPU.push_back(aabb);
m_allAabbsCPU.push_back(aabb);
}
void btGpuSapBroadphase::createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
{
int index = userPtr;
btSapAabb aabb;
for (int i=0;i<4;i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
}
aabb.m_minIndices[3] = index;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
m_smallAabbsCPU.push_back(aabb);
m_allAabbsCPU.push_back(aabb);
}
cl_mem btGpuSapBroadphase::getAabbBuffer()
{
return m_allAabbsGPU.getBufferCL();
}
int btGpuSapBroadphase::getNumOverlap()
{
return m_overlappingPairs.size();
}
cl_mem btGpuSapBroadphase::getOverlappingPairBuffer()
{
return m_overlappingPairs.getBufferCL();
}

View File

@@ -0,0 +1,74 @@
#ifndef BT_GPU_SAP_BROADPHASE_H
#define BT_GPU_SAP_BROADPHASE_H
#include "parallel_primitives/host/btOpenCLArray.h"
#include "parallel_primitives/host/btFillCL.h" //btInt2
class btVector3;
#include "parallel_primitives/host/btRadixSort32CL.h"
struct btSapAabb
{
union
{
float m_min[4];
int m_minIndices[4];
};
union
{
float m_max[4];
int m_signedMaxIndices[4];
//unsigned int m_unsignedMaxIndices[4];
};
};
class btGpuSapBroadphase
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel ;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
class btRadixSort32CL* m_sorter;
public:
btOpenCLArray<btSapAabb> m_allAabbsGPU;
btAlignedObjectArray<btSapAabb> m_allAabbsCPU;
btOpenCLArray<btSapAabb> m_smallAabbsGPU;
btAlignedObjectArray<btSapAabb> m_smallAabbsCPU;
btOpenCLArray<btSapAabb> m_largeAabbsGPU;
btAlignedObjectArray<btSapAabb> m_largeAabbsCPU;
btOpenCLArray<btInt2> m_overlappingPairs;
//temporary gpu work memory
btOpenCLArray<btSortData> m_gpuSmallSortData;
btOpenCLArray<btSapAabb> m_gpuSmallSortedAabbs;
btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
virtual ~btGpuSapBroadphase();
void calculateOverlappingPairs(bool forceHost=false);
void createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
void createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
//call writeAabbsToGpu after done making all changes (createProxy etc)
void writeAabbsToGpu();
cl_mem getAabbBuffer();
int getNumOverlap();
cl_mem getOverlappingPairBuffer();
};
#endif //BT_GPU_SAP_BROADPHASE_H

View File

@@ -0,0 +1,320 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative test for overlap between two aabbs
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
{
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numUnsortedAabbs)
return;
int j = get_global_id(1);
if (j>=numSortedAabbs)
return;
if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))
{
int2 myPair;
myPair.x = unsortedAabbs[i].m_minIndices[3];
myPair.y = sortedAabbs[j].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
for (int j=i+1;j<numObjects;j++)
{
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
break;
}
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int2 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
}
__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int2 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
}
__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
__local btAabbCL localAabbs[128];// = aabbs[i];
btAabbCL myAabb;
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
float testValue = myAabb.m_maxElems[axis];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
int localCount=0;
int block=0;
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
{
int2 myPair;
myPair.x = myAabb.m_minIndices[3];
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
localCount++;
if (localCount==64)
{
localCount = 0;
block+=64;
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
}
//http://stereopsis.com/radix.html
unsigned int FloatFlip(float fl);
unsigned int FloatFlip(float fl)
{
unsigned int f = *(unsigned int*)&fl;
unsigned int mask = -(int)(f >> 31) | 0x80000000;
return f ^ mask;
}
float IFloatFlip(unsigned int f);
float IFloatFlip(unsigned int f)
{
unsigned int mask = ((f >> 31) - 1) | 0x80000000;
unsigned int fl = f ^ mask;
return *(float*)&fl;
}
__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
int src = destAabbs[i].m_maxIndices[3];
destAabbs[i] = allAabbs[src];
destAabbs[i].m_maxIndices[3] = src;
}
__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);
sortData[i].y = i;
}
__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
sortedAabbs[i] = aabbs[sortData[i].y];
}

View File

@@ -0,0 +1,161 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
typedef struct
{
union
{
float4 m_min;
float m_minElems[4];
int m_minIndices[4];
};
union
{
float4 m_max;
float m_maxElems[4];
int m_maxIndices[4];
};
} btAabbCL;
/// conservative test for overlap between two aabbs
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
{
//skip pairs between static (mass=0) objects
if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
return false;
bool overlap = true;
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
return overlap;
}
//computePairsKernelBatchWrite
__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);
__local int numActiveWgItems[1];
__local int breakRequest[1];
__local btAabbCL localAabbs[128];// = aabbs[i];
int2 myPairs[64];
btAabbCL myAabb;
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
float testValue = myAabb.m_maxElems[axis];
if (localId==0)
{
numActiveWgItems[0] = 0;
breakRequest[0] = 0;
}
int localCount=0;
int block=0;
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
barrier(CLK_LOCAL_MEM_FENCE);
atomic_inc(numActiveWgItems);
barrier(CLK_LOCAL_MEM_FENCE);
int localBreak = 0;
int curNumPairs = 0;
int j=i+1;
do
{
barrier(CLK_LOCAL_MEM_FENCE);
if (j<numObjects)
{
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
{
if (!localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if (j>=numObjects && !localBreak)
{
atomic_inc(breakRequest);
localBreak = 1;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (!localBreak)
{
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
{
int2 myPair;
myPair.x = myAabb.m_minIndices[3];
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
myPairs[curNumPairs] = myPair;
curNumPairs++;
if (curNumPairs==64)
{
int curPair = atomic_add(pairCount,curNumPairs);
//avoid a buffer overrun
if ((curPair+curNumPairs)<maxPairs)
{
for (int p=0;p<curNumPairs;p++)
{
pairsOut[curPair+p] = myPairs[p]; //flush to main memory
}
}
curNumPairs = 0;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
localCount++;
if (localCount==64)
{
localCount = 0;
block+=64;
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
}
j++;
} while (breakRequest[0]<numActiveWgItems[0]);
if (curNumPairs>0)
{
//avoid a buffer overrun
int curPair = atomic_add(pairCount,curNumPairs);
if ((curPair+curNumPairs)<maxPairs)
{
for (int p=0;p<curNumPairs;p++)
{
pairsOut[curPair+p] = myPairs[p]; //flush to main memory
}
}
curNumPairs = 0;
}
}

View File

@@ -0,0 +1,164 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* sapFastCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
"//skip pairs between static (mass=0) objects\n"
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
" return false;\n"
" \n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"\n"
"//computePairsKernelBatchWrite\n"
"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" int2 myPairs[64];\n"
" \n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int curNumPairs = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPairs[curNumPairs] = myPair;\n"
" curNumPairs++;\n"
" if (curNumPairs==64)\n"
" {\n"
" int curPair = atomic_add(pairCount,curNumPairs);\n"
" //avoid a buffer overrun\n"
" if ((curPair+curNumPairs)<maxPairs)\n"
" {\n"
" for (int p=0;p<curNumPairs;p++)\n"
" {\n"
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
" }\n"
" }\n"
" curNumPairs = 0;\n"
" }\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
" \n"
" if (curNumPairs>0)\n"
" {\n"
" //avoid a buffer overrun\n"
" int curPair = atomic_add(pairCount,curNumPairs);\n"
" if ((curPair+curNumPairs)<maxPairs)\n"
" {\n"
" for (int p=0;p<curNumPairs;p++)\n"
" {\n"
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
" }\n"
" }\n"
" curNumPairs = 0;\n"
" }\n"
"}\n"
;

View File

@@ -0,0 +1,324 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* sapCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"\n"
"\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
"\n"
" int j = get_global_id(1);\n"
" if (j>=numSortedAabbs)\n"
" return;\n"
"\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" \n"
" myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
" myPair.y = sortedAabbs[j].m_minIndices[3];\n"
"\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
"\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
"\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
"\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"\n"
"\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
"\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int2 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
"\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"\n"
"\n"
"\n"
"\n"
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"\n"
"\n"
"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"\n"
"\n"
"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
"\n"
" sortedAabbs[i] = aabbs[sortData[i].y];\n"
"}\n"
"\n"
;

View File

@@ -0,0 +1,123 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include <stdio.h>
#include "../basic_initialize/btOpenCLUtils.h"
#include "../host/btGpuSapBroadphase.h"
#include "parallel_primitives/host/btVector3.h"
#include "parallel_primitives/host/btFillCL.h"
#include "parallel_primitives/host/btBoundSearchCL.h"
#include "parallel_primitives/host/btRadixSort32CL.h"
#include "parallel_primitives/host/btPrefixScanCL.h"
#include "parallel_primitives/host/CommandLineArgs.h"
#include "parallel_primitives/host/btMinMax.h"
int g_nPassed = 0;
int g_nFailed = 0;
bool g_testFailed = 0;
#define TEST_INIT g_testFailed = 0;
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
cl_context g_context=0;
cl_device_id g_device=0;
cl_command_queue g_queue =0;
const char* g_deviceName = 0;
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
{
void* glCtx=0;
void* glDC = 0;
int ciErrNum = 0;
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils::getNumDevices(g_context);
if (numDev>0)
{
btOpenCLDeviceInfo info;
g_device= btOpenCLUtils::getDevice(g_context,0);
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
btOpenCLUtils::printDeviceInfo(g_device);
btOpenCLUtils::getDeviceInfo(g_device,&info);
g_deviceName = info.m_deviceName;
}
}
void exitCL()
{
clReleaseCommandQueue(g_queue);
clReleaseContext(g_context);
}
inline void broadphaseTest()
{
TEST_INIT;
btGpuSapBroadphase* sap = new btGpuSapBroadphase(g_context,g_device,g_queue);
int group=1;
int mask=1;
btVector3 aabbMin(0,0,0);
btVector3 aabbMax(1,1,1);
int usrPtr = 1;
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
// aabbMin.setValue(2,2,2);
// aabbMax.setValue(3,3,3);
usrPtr = 2;
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
sap->writeAabbsToGpu();
sap->calculateOverlappingPairs();
int numOverlap = sap->getNumOverlap();
cl_mem buf = sap->getOverlappingPairBuffer();
TEST_ASSERT(numOverlap==1);
delete sap;
TEST_REPORT( "broadphaseTest" );
}
int main(int argc, char** argv)
{
int preferredDeviceIndex = -1;
int preferredPlatformIndex = -1;
CommandLineArgs args(argc, argv);
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
initCL(preferredDeviceIndex,preferredPlatformIndex);
broadphaseTest();
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
printf("End, press <enter>\n");
getchar();
exitCL();
}

View File

@@ -0,0 +1,46 @@
function createProject(vendor)
hasCL = findOpenCL(vendor)
if (hasCL) then
project ("OpenCL_broadphase_test_" .. vendor)
initOpenCL(vendor)
language "C++"
kind "ConsoleApp"
targetdir "../../../bin"
includedirs {"..","../.."}
files {
"main.cpp",
"../../basic_initialize/btOpenCLInclude.h",
"../../basic_initialize/btOpenCLUtils.cpp",
"../../basic_initialize/btOpenCLUtils.h",
"../host/btGpuSapBroadphase.cpp",
"../host/btGpuSapBroadphase.h",
"../../parallel_primitives/host/btFillCL.cpp",
"../../parallel_primitives/host/btFillCL.h",
"../../parallel_primitives/host/btBoundSearchCL.cpp",
"../../parallel_primitives/host/btBoundSearchCL.h",
"../../parallel_primitives/host/btPrefixScanCL.cpp",
"../../parallel_primitives/host/btPrefixScanCL.h",
"../../parallel_primitives/host/btRadixSort32CL.cpp",
"../../parallel_primitives/host/btRadixSort32CL.h",
"../../parallel_primitives/host/btAlignedAllocator.cpp",
"../../parallel_primitives/host/btAlignedAllocator.h",
"../../parallel_primitives/host/btAlignedObjectArray.h",
"../../parallel_primitives/host/btQuickprof.cpp",
"../../parallel_primitives/host/btQuickprof.h",
}
end
end
createProject("AMD")
createProject("Intel")
createProject("NVIDIA")
createProject("Apple")

View File

@@ -652,6 +652,8 @@ int main( int argc, char** argv)
printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n");
cl_platform_id platformId;
g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
// g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext);
@@ -674,7 +676,7 @@ int main( int argc, char** argv)
//srand(time(NULL));
srand(0); // presently deterministic
unsigned int num_elements = 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
unsigned int iterations = 10;
bool keys_only = true;