add gpu_broadphase with basic test
This commit is contained in:
485
opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
Normal file
485
opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
Normal file
@@ -0,0 +1,485 @@
|
||||
|
||||
#include "btGpuSapBroadphase.h"
|
||||
#include "parallel_primitives/host/btVector3.h"
|
||||
#include "parallel_primitives/host/btLauncherCL.h"
|
||||
#include "parallel_primitives/host/btQuickprof.h"
|
||||
#include "../basic_initialize/btOpenCLUtils.h"
|
||||
|
||||
#include "../kernels/integrateKernel.h"
|
||||
|
||||
#include "../kernels/sapKernels.h"
|
||||
#include "../kernels/sapFastKernels.h"
|
||||
#include "parallel_primitives/host/btMinMax.h"
|
||||
|
||||
|
||||
btGpuSapBroadphase::btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
|
||||
:m_context(ctx),
|
||||
m_device(device),
|
||||
m_queue(q),
|
||||
m_allAabbsGPU(ctx,q),
|
||||
m_smallAabbsGPU(ctx,q),
|
||||
m_largeAabbsGPU(ctx,q),
|
||||
m_overlappingPairs(ctx,q),
|
||||
m_gpuSmallSortData(ctx,q),
|
||||
m_gpuSmallSortedAabbs(ctx,q)
|
||||
{
|
||||
const char* sapSrc = sapCL;
|
||||
const char* sapFastSrc = sapFastCL;
|
||||
|
||||
cl_int errNum=0;
|
||||
|
||||
cl_program sapProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl");
|
||||
btAssert(errNum==CL_SUCCESS);
|
||||
cl_program sapFastProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl");
|
||||
btAssert(errNum==CL_SUCCESS);
|
||||
|
||||
|
||||
//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
|
||||
//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
|
||||
//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
|
||||
|
||||
|
||||
m_sap2Kernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
|
||||
btAssert(errNum==CL_SUCCESS);
|
||||
|
||||
#if 0
|
||||
|
||||
m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
|
||||
btAssert(errNum==CL_SUCCESS);
|
||||
#else
|
||||
#ifndef __APPLE__
|
||||
m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
|
||||
btAssert(errNum==CL_SUCCESS);
|
||||
#else
|
||||
m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
|
||||
btAssert(errNum==CL_SUCCESS);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
m_flipFloatKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
|
||||
|
||||
m_copyAabbsKernel= btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
|
||||
|
||||
m_scatterKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
|
||||
|
||||
m_sorter = new btRadixSort32CL(m_context,m_device,m_queue);
|
||||
}
|
||||
|
||||
btGpuSapBroadphase::~btGpuSapBroadphase()
|
||||
{
|
||||
delete m_sorter;
|
||||
clReleaseKernel(m_scatterKernel);
|
||||
clReleaseKernel(m_flipFloatKernel);
|
||||
clReleaseKernel(m_copyAabbsKernel);
|
||||
clReleaseKernel(m_sapKernel);
|
||||
clReleaseKernel(m_sap2Kernel);
|
||||
|
||||
}
|
||||
|
||||
/// conservative test for overlap between two aabbs
|
||||
static bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1,
|
||||
const btVector3 &aabbMin2, const btVector3 &aabbMax2)
|
||||
{
|
||||
bool overlap = true;
|
||||
overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
|
||||
overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
|
||||
overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
|
||||
void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
|
||||
{
|
||||
int axis = 0;//todo on GPU for now hardcode
|
||||
|
||||
btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
|
||||
|
||||
|
||||
if (forceHost)
|
||||
{
|
||||
|
||||
btAlignedObjectArray<btSapAabb> allHostAabbs;
|
||||
m_allAabbsGPU.copyToHost(allHostAabbs);
|
||||
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int j=0;j<numSmallAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
|
||||
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsCPU.size();
|
||||
for (int j=0;j<numLargeAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
|
||||
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
btAlignedObjectArray<btInt2> hostPairs;
|
||||
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int i=0;i<numSmallAabbs;i++)
|
||||
{
|
||||
float reference = m_smallAabbsCPU[i].m_max[axis];
|
||||
|
||||
for (int j=i+1;j<numSmallAabbs;j++)
|
||||
{
|
||||
if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
|
||||
(btVector3&)m_smallAabbsCPU[j].m_min,(btVector3&)m_smallAabbsCPU[j].m_max))
|
||||
{
|
||||
btInt2 pair;
|
||||
pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
|
||||
pair.y = m_smallAabbsCPU[j].m_minIndices[3];
|
||||
hostPairs.push_back(pair);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int i=0;i<numSmallAabbs;i++)
|
||||
{
|
||||
float reference = m_smallAabbsCPU[i].m_max[axis];
|
||||
int numLargeAabbs = m_largeAabbsCPU.size();
|
||||
|
||||
for (int j=0;j<numLargeAabbs;j++)
|
||||
{
|
||||
if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
|
||||
(btVector3&)m_largeAabbsCPU[j].m_min,(btVector3&)m_largeAabbsCPU[j].m_max))
|
||||
{
|
||||
btInt2 pair;
|
||||
pair.x = m_largeAabbsCPU[j].m_minIndices[3];
|
||||
pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
|
||||
hostPairs.push_back(pair);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (hostPairs.size())
|
||||
{
|
||||
m_overlappingPairs.copyFromHost(hostPairs);
|
||||
} else
|
||||
{
|
||||
m_overlappingPairs.resize(0);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
|
||||
bool syncOnHost = false;
|
||||
|
||||
if (syncOnHost)
|
||||
{
|
||||
BT_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
|
||||
btAlignedObjectArray<btSapAabb> allHostAabbs;
|
||||
m_allAabbsGPU.copyToHost(allHostAabbs);
|
||||
|
||||
m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsCPU.size();
|
||||
for (int j=0;j<numSmallAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
|
||||
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
}
|
||||
}
|
||||
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
|
||||
|
||||
} else
|
||||
{
|
||||
{
|
||||
int numSmallAabbs = m_smallAabbsGPU.size();
|
||||
BT_PROFILE("copyAabbsKernelSmall");
|
||||
btBufferInfoCL bInfo[] = {
|
||||
btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
|
||||
btBufferInfoCL( m_smallAabbsGPU.getBufferCL()),
|
||||
};
|
||||
|
||||
btLauncherCL launcher(m_queue, m_copyAabbsKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs );
|
||||
int num = numSmallAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
|
||||
if (syncOnHost)
|
||||
{
|
||||
BT_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
|
||||
btAlignedObjectArray<btSapAabb> allHostAabbs;
|
||||
m_allAabbsGPU.copyToHost(allHostAabbs);
|
||||
|
||||
m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsCPU.size();
|
||||
for (int j=0;j<numLargeAabbs;j++)
|
||||
{
|
||||
//sync aabb
|
||||
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
|
||||
m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
|
||||
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
|
||||
}
|
||||
}
|
||||
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
|
||||
|
||||
} else
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsGPU.size();
|
||||
|
||||
if (numLargeAabbs)
|
||||
{
|
||||
BT_PROFILE("copyAabbsKernelLarge");
|
||||
btBufferInfoCL bInfo[] = {
|
||||
btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ),
|
||||
btBufferInfoCL( m_largeAabbsGPU.getBufferCL()),
|
||||
};
|
||||
|
||||
btLauncherCL launcher(m_queue, m_copyAabbsKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( numLargeAabbs );
|
||||
int num = numLargeAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
BT_PROFILE("GPU SAP");
|
||||
|
||||
int numSmallAabbs = m_smallAabbsGPU.size();
|
||||
m_gpuSmallSortData.resize(numSmallAabbs);
|
||||
int numLargeAabbs = m_smallAabbsGPU.size();
|
||||
|
||||
#if 1
|
||||
if (m_smallAabbsGPU.size())
|
||||
{
|
||||
BT_PROFILE("flipFloatKernel");
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL())};
|
||||
btLauncherCL launcher(m_queue, m_flipFloatKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs );
|
||||
launcher.setConst( axis );
|
||||
|
||||
int num = numSmallAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
{
|
||||
BT_PROFILE("gpu radix sort\n");
|
||||
m_sorter->execute(m_gpuSmallSortData);
|
||||
clFinish(m_queue);
|
||||
}
|
||||
|
||||
m_gpuSmallSortedAabbs.resize(numSmallAabbs);
|
||||
if (numSmallAabbs)
|
||||
{
|
||||
BT_PROFILE("scatterKernel");
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
|
||||
btLauncherCL launcher(m_queue, m_scatterKernel );
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs);
|
||||
int num = numSmallAabbs;
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
|
||||
}
|
||||
|
||||
|
||||
int maxPairsPerBody = 64;
|
||||
int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
|
||||
m_overlappingPairs.resize(maxPairs);
|
||||
|
||||
btOpenCLArray<int> pairCount(m_context, m_queue);
|
||||
pairCount.push_back(0);
|
||||
int numPairs=0;
|
||||
|
||||
{
|
||||
int numLargeAabbs = m_largeAabbsGPU.size();
|
||||
if (numLargeAabbs && numSmallAabbs)
|
||||
{
|
||||
BT_PROFILE("sap2Kernel");
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
|
||||
btLauncherCL launcher(m_queue, m_sap2Kernel);
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( numLargeAabbs );
|
||||
launcher.setConst( numSmallAabbs);
|
||||
launcher.setConst( axis );
|
||||
launcher.setConst( maxPairs );
|
||||
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
|
||||
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
|
||||
|
||||
numPairs = pairCount.at(0);
|
||||
if (numPairs >maxPairs)
|
||||
numPairs =maxPairs;
|
||||
|
||||
}
|
||||
}
|
||||
if (m_gpuSmallSortedAabbs.size())
|
||||
{
|
||||
BT_PROFILE("sapKernel");
|
||||
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
|
||||
btLauncherCL launcher(m_queue, m_sapKernel);
|
||||
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
|
||||
launcher.setConst( numSmallAabbs );
|
||||
launcher.setConst( axis );
|
||||
launcher.setConst( maxPairs );
|
||||
|
||||
|
||||
int num = numSmallAabbs;
|
||||
#if 0
|
||||
int buffSize = launcher.getSerializationBufferSize();
|
||||
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
|
||||
for (int i=0;i<buffSize+1;i++)
|
||||
{
|
||||
unsigned char* ptr = (unsigned char*)&buf[i];
|
||||
*ptr = 0xff;
|
||||
}
|
||||
int actualWrite = launcher.serializeArguments(buf,buffSize);
|
||||
|
||||
unsigned char* cptr = (unsigned char*)&buf[buffSize];
|
||||
// printf("buf[buffSize] = %d\n",*cptr);
|
||||
|
||||
assert(buf[buffSize]==0xff);//check for buffer overrun
|
||||
int* ptr = (int*)&buf[buffSize];
|
||||
|
||||
*ptr = num;
|
||||
|
||||
FILE* f = fopen("m_sapKernelArgs.bin","wb");
|
||||
fwrite(buf,buffSize+sizeof(int),1,f);
|
||||
fclose(f);
|
||||
#endif//
|
||||
|
||||
launcher.launch1D( num);
|
||||
clFinish(m_queue);
|
||||
|
||||
numPairs = pairCount.at(0);
|
||||
if (numPairs>maxPairs)
|
||||
numPairs = maxPairs;
|
||||
}
|
||||
|
||||
#else
|
||||
int numPairs = 0;
|
||||
|
||||
|
||||
btLauncherCL launcher(m_queue, m_sapKernel);
|
||||
|
||||
const char* fileName = "m_sapKernelArgs.bin";
|
||||
FILE* f = fopen(fileName,"rb");
|
||||
if (f)
|
||||
{
|
||||
int sizeInBytes=0;
|
||||
if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
|
||||
{
|
||||
printf("error, cannot get file size\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
|
||||
fread(buf,sizeInBytes,1,f);
|
||||
int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
|
||||
int num = *(int*)&buf[serializedBytes];
|
||||
launcher.launch1D( num);
|
||||
|
||||
btOpenCLArray<int> pairCount(m_context, m_queue);
|
||||
int numElements = launcher.m_arrays[2]->size()/sizeof(int);
|
||||
pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
|
||||
numPairs = pairCount.at(0);
|
||||
//printf("overlapping pairs = %d\n",numPairs);
|
||||
btAlignedObjectArray<btInt2> hostOoverlappingPairs;
|
||||
btOpenCLArray<btInt2> tmpGpuPairs(m_context,m_queue);
|
||||
tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
|
||||
|
||||
tmpGpuPairs.copyToHost(hostOoverlappingPairs);
|
||||
m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
|
||||
//printf("hello %d\n", m_overlappingPairs.size());
|
||||
free(buf);
|
||||
fclose(f);
|
||||
|
||||
} else {
|
||||
printf("error: cannot find file %s\n",fileName);
|
||||
}
|
||||
|
||||
clFinish(m_queue);
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
m_overlappingPairs.resize(numPairs);
|
||||
|
||||
}//BT_PROFILE("GPU_RADIX SORT");
|
||||
|
||||
}
|
||||
|
||||
void btGpuSapBroadphase::writeAabbsToGpu()
|
||||
{
|
||||
m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
|
||||
m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
|
||||
m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
|
||||
|
||||
}
|
||||
|
||||
void btGpuSapBroadphase::createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
|
||||
{
|
||||
int index = userPtr;
|
||||
btSapAabb aabb;
|
||||
for (int i=0;i<4;i++)
|
||||
{
|
||||
aabb.m_min[i] = aabbMin[i];
|
||||
aabb.m_max[i] = aabbMax[i];
|
||||
}
|
||||
aabb.m_minIndices[3] = index;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
|
||||
m_largeAabbsCPU.push_back(aabb);
|
||||
m_allAabbsCPU.push_back(aabb);
|
||||
}
|
||||
|
||||
void btGpuSapBroadphase::createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
|
||||
{
|
||||
int index = userPtr;
|
||||
btSapAabb aabb;
|
||||
for (int i=0;i<4;i++)
|
||||
{
|
||||
aabb.m_min[i] = aabbMin[i];
|
||||
aabb.m_max[i] = aabbMax[i];
|
||||
}
|
||||
aabb.m_minIndices[3] = index;
|
||||
aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
|
||||
m_smallAabbsCPU.push_back(aabb);
|
||||
m_allAabbsCPU.push_back(aabb);
|
||||
}
|
||||
|
||||
cl_mem btGpuSapBroadphase::getAabbBuffer()
|
||||
{
|
||||
return m_allAabbsGPU.getBufferCL();
|
||||
}
|
||||
|
||||
int btGpuSapBroadphase::getNumOverlap()
|
||||
{
|
||||
return m_overlappingPairs.size();
|
||||
}
|
||||
cl_mem btGpuSapBroadphase::getOverlappingPairBuffer()
|
||||
{
|
||||
return m_overlappingPairs.getBufferCL();
|
||||
}
|
||||
74
opencl/gpu_broadphase/host/btGpuSapBroadphase.h
Normal file
74
opencl/gpu_broadphase/host/btGpuSapBroadphase.h
Normal file
@@ -0,0 +1,74 @@
|
||||
#ifndef BT_GPU_SAP_BROADPHASE_H
|
||||
#define BT_GPU_SAP_BROADPHASE_H
|
||||
|
||||
#include "parallel_primitives/host/btOpenCLArray.h"
|
||||
#include "parallel_primitives/host/btFillCL.h" //btInt2
|
||||
class btVector3;
|
||||
#include "parallel_primitives/host/btRadixSort32CL.h"
|
||||
|
||||
struct btSapAabb
|
||||
{
|
||||
union
|
||||
{
|
||||
float m_min[4];
|
||||
int m_minIndices[4];
|
||||
};
|
||||
union
|
||||
{
|
||||
float m_max[4];
|
||||
int m_signedMaxIndices[4];
|
||||
//unsigned int m_unsignedMaxIndices[4];
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
|
||||
class btGpuSapBroadphase
|
||||
{
|
||||
|
||||
cl_context m_context;
|
||||
cl_device_id m_device;
|
||||
cl_command_queue m_queue;
|
||||
cl_kernel m_flipFloatKernel;
|
||||
cl_kernel m_scatterKernel ;
|
||||
cl_kernel m_copyAabbsKernel;
|
||||
cl_kernel m_sapKernel;
|
||||
cl_kernel m_sap2Kernel;
|
||||
|
||||
class btRadixSort32CL* m_sorter;
|
||||
|
||||
public:
|
||||
|
||||
btOpenCLArray<btSapAabb> m_allAabbsGPU;
|
||||
btAlignedObjectArray<btSapAabb> m_allAabbsCPU;
|
||||
|
||||
btOpenCLArray<btSapAabb> m_smallAabbsGPU;
|
||||
btAlignedObjectArray<btSapAabb> m_smallAabbsCPU;
|
||||
|
||||
btOpenCLArray<btSapAabb> m_largeAabbsGPU;
|
||||
btAlignedObjectArray<btSapAabb> m_largeAabbsCPU;
|
||||
|
||||
btOpenCLArray<btInt2> m_overlappingPairs;
|
||||
|
||||
//temporary gpu work memory
|
||||
btOpenCLArray<btSortData> m_gpuSmallSortData;
|
||||
btOpenCLArray<btSapAabb> m_gpuSmallSortedAabbs;
|
||||
|
||||
|
||||
btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
|
||||
virtual ~btGpuSapBroadphase();
|
||||
|
||||
void calculateOverlappingPairs(bool forceHost=false);
|
||||
|
||||
void createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
|
||||
void createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
|
||||
|
||||
//call writeAabbsToGpu after done making all changes (createProxy etc)
|
||||
void writeAabbsToGpu();
|
||||
|
||||
cl_mem getAabbBuffer();
|
||||
int getNumOverlap();
|
||||
cl_mem getOverlappingPairBuffer();
|
||||
};
|
||||
|
||||
#endif //BT_GPU_SAP_BROADPHASE_H
|
||||
320
opencl/gpu_broadphase/kernels/sap.cl
Normal file
320
opencl/gpu_broadphase/kernels/sap.cl
Normal file
@@ -0,0 +1,320 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Erwin Coumans
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
union
|
||||
{
|
||||
float4 m_min;
|
||||
float m_minElems[4];
|
||||
int m_minIndices[4];
|
||||
};
|
||||
union
|
||||
{
|
||||
float4 m_max;
|
||||
float m_maxElems[4];
|
||||
int m_maxIndices[4];
|
||||
};
|
||||
} btAabbCL;
|
||||
|
||||
|
||||
/// conservative test for overlap between two aabbs
|
||||
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
|
||||
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
|
||||
{
|
||||
bool overlap = true;
|
||||
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
|
||||
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
|
||||
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
|
||||
bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
|
||||
{
|
||||
bool overlap = true;
|
||||
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
|
||||
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
|
||||
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
|
||||
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
|
||||
bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
|
||||
{
|
||||
bool overlap = true;
|
||||
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
|
||||
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
|
||||
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
|
||||
|
||||
__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i>=numUnsortedAabbs)
|
||||
return;
|
||||
|
||||
int j = get_global_id(1);
|
||||
if (j>=numSortedAabbs)
|
||||
return;
|
||||
|
||||
if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))
|
||||
{
|
||||
int2 myPair;
|
||||
|
||||
myPair.x = unsortedAabbs[i].m_minIndices[3];
|
||||
myPair.y = sortedAabbs[j].m_minIndices[3];
|
||||
|
||||
int curPair = atomic_inc (pairCount);
|
||||
if (curPair<maxPairs)
|
||||
{
|
||||
pairsOut[curPair] = myPair; //flush to main memory
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i>=numObjects)
|
||||
return;
|
||||
for (int j=i+1;j<numObjects;j++)
|
||||
{
|
||||
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
|
||||
{
|
||||
break;
|
||||
}
|
||||
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
|
||||
{
|
||||
int2 myPair;
|
||||
myPair.x = aabbs[i].m_minIndices[3];
|
||||
myPair.y = aabbs[j].m_minIndices[3];
|
||||
int curPair = atomic_inc (pairCount);
|
||||
if (curPair<maxPairs)
|
||||
{
|
||||
pairsOut[curPair] = myPair; //flush to main memory
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
int localId = get_local_id(0);
|
||||
|
||||
__local int numActiveWgItems[1];
|
||||
__local int breakRequest[1];
|
||||
|
||||
if (localId==0)
|
||||
{
|
||||
numActiveWgItems[0] = 0;
|
||||
breakRequest[0] = 0;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
atomic_inc(numActiveWgItems);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int localBreak = 0;
|
||||
|
||||
int j=i+1;
|
||||
do
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j<numObjects)
|
||||
{
|
||||
if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis]))
|
||||
{
|
||||
if (!localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j>=numObjects && !localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (!localBreak)
|
||||
{
|
||||
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
|
||||
{
|
||||
int2 myPair;
|
||||
myPair.x = aabbs[i].m_minIndices[3];
|
||||
myPair.y = aabbs[j].m_minIndices[3];
|
||||
int curPair = atomic_inc (pairCount);
|
||||
if (curPair<maxPairs)
|
||||
{
|
||||
pairsOut[curPair] = myPair; //flush to main memory
|
||||
}
|
||||
}
|
||||
}
|
||||
j++;
|
||||
|
||||
} while (breakRequest[0]<numActiveWgItems[0]);
|
||||
}
|
||||
|
||||
|
||||
__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
int localId = get_local_id(0);
|
||||
|
||||
__local int numActiveWgItems[1];
|
||||
__local int breakRequest[1];
|
||||
__local btAabbCL localAabbs[128];// = aabbs[i];
|
||||
|
||||
btAabbCL myAabb;
|
||||
|
||||
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
|
||||
float testValue = myAabb.m_maxElems[axis];
|
||||
|
||||
if (localId==0)
|
||||
{
|
||||
numActiveWgItems[0] = 0;
|
||||
breakRequest[0] = 0;
|
||||
}
|
||||
int localCount=0;
|
||||
int block=0;
|
||||
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
|
||||
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
atomic_inc(numActiveWgItems);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int localBreak = 0;
|
||||
|
||||
int j=i+1;
|
||||
do
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j<numObjects)
|
||||
{
|
||||
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
|
||||
{
|
||||
if (!localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j>=numObjects && !localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (!localBreak)
|
||||
{
|
||||
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
|
||||
{
|
||||
int2 myPair;
|
||||
myPair.x = myAabb.m_minIndices[3];
|
||||
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
|
||||
int curPair = atomic_inc (pairCount);
|
||||
if (curPair<maxPairs)
|
||||
{
|
||||
pairsOut[curPair] = myPair; //flush to main memory
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
localCount++;
|
||||
if (localCount==64)
|
||||
{
|
||||
localCount = 0;
|
||||
block+=64;
|
||||
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
|
||||
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
|
||||
}
|
||||
j++;
|
||||
|
||||
} while (breakRequest[0]<numActiveWgItems[0]);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
//http://stereopsis.com/radix.html
|
||||
unsigned int FloatFlip(float fl);
|
||||
unsigned int FloatFlip(float fl)
|
||||
{
|
||||
unsigned int f = *(unsigned int*)&fl;
|
||||
unsigned int mask = -(int)(f >> 31) | 0x80000000;
|
||||
return f ^ mask;
|
||||
}
|
||||
float IFloatFlip(unsigned int f);
|
||||
float IFloatFlip(unsigned int f)
|
||||
{
|
||||
unsigned int mask = ((f >> 31) - 1) | 0x80000000;
|
||||
unsigned int fl = f ^ mask;
|
||||
return *(float*)&fl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i>=numObjects)
|
||||
return;
|
||||
int src = destAabbs[i].m_maxIndices[3];
|
||||
destAabbs[i] = allAabbs[src];
|
||||
destAabbs[i].m_maxIndices[3] = src;
|
||||
}
|
||||
|
||||
|
||||
__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i>=numObjects)
|
||||
return;
|
||||
|
||||
sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);
|
||||
sortData[i].y = i;
|
||||
|
||||
}
|
||||
|
||||
|
||||
__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
if (i>=numObjects)
|
||||
return;
|
||||
|
||||
sortedAabbs[i] = aabbs[sortData[i].y];
|
||||
}
|
||||
161
opencl/gpu_broadphase/kernels/sapFast.cl
Normal file
161
opencl/gpu_broadphase/kernels/sapFast.cl
Normal file
@@ -0,0 +1,161 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
//Originally written by Erwin Coumans
|
||||
|
||||
|
||||
typedef struct
|
||||
{
|
||||
union
|
||||
{
|
||||
float4 m_min;
|
||||
float m_minElems[4];
|
||||
int m_minIndices[4];
|
||||
};
|
||||
union
|
||||
{
|
||||
float4 m_max;
|
||||
float m_maxElems[4];
|
||||
int m_maxIndices[4];
|
||||
};
|
||||
} btAabbCL;
|
||||
|
||||
|
||||
/// conservative test for overlap between two aabbs
|
||||
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
|
||||
bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
|
||||
{
|
||||
//skip pairs between static (mass=0) objects
|
||||
if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
|
||||
return false;
|
||||
|
||||
bool overlap = true;
|
||||
overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
|
||||
overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
|
||||
overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
|
||||
return overlap;
|
||||
}
|
||||
|
||||
|
||||
//computePairsKernelBatchWrite
|
||||
__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
int localId = get_local_id(0);
|
||||
|
||||
__local int numActiveWgItems[1];
|
||||
__local int breakRequest[1];
|
||||
__local btAabbCL localAabbs[128];// = aabbs[i];
|
||||
|
||||
int2 myPairs[64];
|
||||
|
||||
btAabbCL myAabb;
|
||||
|
||||
myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
|
||||
float testValue = myAabb.m_maxElems[axis];
|
||||
|
||||
if (localId==0)
|
||||
{
|
||||
numActiveWgItems[0] = 0;
|
||||
breakRequest[0] = 0;
|
||||
}
|
||||
int localCount=0;
|
||||
int block=0;
|
||||
localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
|
||||
localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
atomic_inc(numActiveWgItems);
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
int localBreak = 0;
|
||||
int curNumPairs = 0;
|
||||
|
||||
int j=i+1;
|
||||
do
|
||||
{
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j<numObjects)
|
||||
{
|
||||
if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis]))
|
||||
{
|
||||
if (!localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (j>=numObjects && !localBreak)
|
||||
{
|
||||
atomic_inc(breakRequest);
|
||||
localBreak = 1;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
if (!localBreak)
|
||||
{
|
||||
if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
|
||||
{
|
||||
int2 myPair;
|
||||
myPair.x = myAabb.m_minIndices[3];
|
||||
myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
|
||||
myPairs[curNumPairs] = myPair;
|
||||
curNumPairs++;
|
||||
if (curNumPairs==64)
|
||||
{
|
||||
int curPair = atomic_add(pairCount,curNumPairs);
|
||||
//avoid a buffer overrun
|
||||
if ((curPair+curNumPairs)<maxPairs)
|
||||
{
|
||||
for (int p=0;p<curNumPairs;p++)
|
||||
{
|
||||
pairsOut[curPair+p] = myPairs[p]; //flush to main memory
|
||||
}
|
||||
}
|
||||
curNumPairs = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
localCount++;
|
||||
if (localCount==64)
|
||||
{
|
||||
localCount = 0;
|
||||
block+=64;
|
||||
localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
|
||||
localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
|
||||
}
|
||||
j++;
|
||||
|
||||
} while (breakRequest[0]<numActiveWgItems[0]);
|
||||
|
||||
|
||||
if (curNumPairs>0)
|
||||
{
|
||||
//avoid a buffer overrun
|
||||
int curPair = atomic_add(pairCount,curNumPairs);
|
||||
if ((curPair+curNumPairs)<maxPairs)
|
||||
{
|
||||
for (int p=0;p<curNumPairs;p++)
|
||||
{
|
||||
pairsOut[curPair+p] = myPairs[p]; //flush to main memory
|
||||
}
|
||||
}
|
||||
curNumPairs = 0;
|
||||
}
|
||||
}
|
||||
164
opencl/gpu_broadphase/kernels/sapFastKernels.h
Normal file
164
opencl/gpu_broadphase/kernels/sapFastKernels.h
Normal file
@@ -0,0 +1,164 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* sapFastCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
"//skip pairs between static (mass=0) objects\n"
|
||||
" if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
|
||||
" return false;\n"
|
||||
" \n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"//computePairsKernelBatchWrite\n"
|
||||
"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" int2 myPairs[64];\n"
|
||||
" \n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" int curNumPairs = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" myPairs[curNumPairs] = myPair;\n"
|
||||
" curNumPairs++;\n"
|
||||
" if (curNumPairs==64)\n"
|
||||
" {\n"
|
||||
" int curPair = atomic_add(pairCount,curNumPairs);\n"
|
||||
" //avoid a buffer overrun\n"
|
||||
" if ((curPair+curNumPairs)<maxPairs)\n"
|
||||
" {\n"
|
||||
" for (int p=0;p<curNumPairs;p++)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" curNumPairs = 0;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
" \n"
|
||||
" if (curNumPairs>0)\n"
|
||||
" {\n"
|
||||
" //avoid a buffer overrun\n"
|
||||
" int curPair = atomic_add(pairCount,curNumPairs);\n"
|
||||
" if ((curPair+curNumPairs)<maxPairs)\n"
|
||||
" {\n"
|
||||
" for (int p=0;p<curNumPairs;p++)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" curNumPairs = 0;\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
;
|
||||
324
opencl/gpu_broadphase/kernels/sapKernels.h
Normal file
324
opencl/gpu_broadphase/kernels/sapKernels.h
Normal file
@@ -0,0 +1,324 @@
|
||||
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
|
||||
static const char* sapCL= \
|
||||
"/*\n"
|
||||
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
|
||||
"\n"
|
||||
"This software is provided 'as-is', without any express or implied warranty.\n"
|
||||
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
|
||||
"Permission is granted to anyone to use this software for any purpose, \n"
|
||||
"including commercial applications, and to alter it and redistribute it freely, \n"
|
||||
"subject to the following restrictions:\n"
|
||||
"\n"
|
||||
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
|
||||
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
|
||||
"3. This notice may not be removed or altered from any source distribution.\n"
|
||||
"*/\n"
|
||||
"//Originally written by Erwin Coumans\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"typedef struct \n"
|
||||
"{\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_min;\n"
|
||||
" float m_minElems[4];\n"
|
||||
" int m_minIndices[4];\n"
|
||||
" };\n"
|
||||
" union\n"
|
||||
" {\n"
|
||||
" float4 m_max;\n"
|
||||
" float m_maxElems[4];\n"
|
||||
" int m_maxIndices[4];\n"
|
||||
" };\n"
|
||||
"} btAabbCL;\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/// conservative test for overlap between two aabbs\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
|
||||
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
|
||||
"{\n"
|
||||
" bool overlap = true;\n"
|
||||
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
|
||||
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
|
||||
" return overlap;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numUnsortedAabbs)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" int j = get_global_id(1);\n"
|
||||
" if (j>=numSortedAabbs)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" \n"
|
||||
" myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = sortedAabbs[j].m_minIndices[3];\n"
|
||||
"\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" for (int j=i+1;j<numObjects;j++)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" break;\n"
|
||||
" }\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
"\n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
"\n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = aabbs[i].m_minIndices[3];\n"
|
||||
" myPair.y = aabbs[j].m_minIndices[3];\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
"\n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" int localId = get_local_id(0);\n"
|
||||
"\n"
|
||||
" __local int numActiveWgItems[1];\n"
|
||||
" __local int breakRequest[1];\n"
|
||||
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
|
||||
" \n"
|
||||
" btAabbCL myAabb;\n"
|
||||
" \n"
|
||||
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
|
||||
" float testValue = myAabb.m_maxElems[axis];\n"
|
||||
" \n"
|
||||
" if (localId==0)\n"
|
||||
" {\n"
|
||||
" numActiveWgItems[0] = 0;\n"
|
||||
" breakRequest[0] = 0;\n"
|
||||
" }\n"
|
||||
" int localCount=0;\n"
|
||||
" int block=0;\n"
|
||||
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" atomic_inc(numActiveWgItems);\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" int localBreak = 0;\n"
|
||||
" \n"
|
||||
" int j=i+1;\n"
|
||||
" do\n"
|
||||
" {\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j<numObjects)\n"
|
||||
" {\n"
|
||||
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
|
||||
" {\n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (j>=numObjects && !localBreak)\n"
|
||||
" {\n"
|
||||
" atomic_inc(breakRequest);\n"
|
||||
" localBreak = 1;\n"
|
||||
" }\n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
" \n"
|
||||
" if (!localBreak)\n"
|
||||
" {\n"
|
||||
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
|
||||
" {\n"
|
||||
" int2 myPair;\n"
|
||||
" myPair.x = myAabb.m_minIndices[3];\n"
|
||||
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
|
||||
" int curPair = atomic_inc (pairCount);\n"
|
||||
" if (curPair<maxPairs)\n"
|
||||
" {\n"
|
||||
" pairsOut[curPair] = myPair; //flush to main memory\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" }\n"
|
||||
" \n"
|
||||
" barrier(CLK_LOCAL_MEM_FENCE);\n"
|
||||
"\n"
|
||||
" localCount++;\n"
|
||||
" if (localCount==64)\n"
|
||||
" {\n"
|
||||
" localCount = 0;\n"
|
||||
" block+=64; \n"
|
||||
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
|
||||
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
|
||||
" }\n"
|
||||
" j++;\n"
|
||||
" \n"
|
||||
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"//http://stereopsis.com/radix.html\n"
|
||||
"unsigned int FloatFlip(float fl);\n"
|
||||
"unsigned int FloatFlip(float fl)\n"
|
||||
"{\n"
|
||||
" unsigned int f = *(unsigned int*)&fl;\n"
|
||||
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
|
||||
" return f ^ mask;\n"
|
||||
"}\n"
|
||||
"float IFloatFlip(unsigned int f);\n"
|
||||
"float IFloatFlip(unsigned int f)\n"
|
||||
"{\n"
|
||||
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
|
||||
" unsigned int fl = f ^ mask;\n"
|
||||
" return *(float*)&fl;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" int src = destAabbs[i].m_maxIndices[3];\n"
|
||||
" destAabbs[i] = allAabbs[src];\n"
|
||||
" destAabbs[i].m_maxIndices[3] = src;\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
" \n"
|
||||
" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
|
||||
" sortData[i].y = i;\n"
|
||||
" \n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
|
||||
"{\n"
|
||||
" int i = get_global_id(0);\n"
|
||||
" if (i>=numObjects)\n"
|
||||
" return;\n"
|
||||
"\n"
|
||||
" sortedAabbs[i] = aabbs[sortData[i].y];\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
;
|
||||
123
opencl/gpu_broadphase/test/main.cpp
Normal file
123
opencl/gpu_broadphase/test/main.cpp
Normal file
@@ -0,0 +1,123 @@
|
||||
/*
|
||||
Copyright (c) 2012 Advanced Micro Devices, Inc.
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the use of this software.
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it freely,
|
||||
subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
|
||||
#include <stdio.h>
|
||||
#include "../basic_initialize/btOpenCLUtils.h"
|
||||
#include "../host/btGpuSapBroadphase.h"
|
||||
#include "parallel_primitives/host/btVector3.h"
|
||||
#include "parallel_primitives/host/btFillCL.h"
|
||||
#include "parallel_primitives/host/btBoundSearchCL.h"
|
||||
#include "parallel_primitives/host/btRadixSort32CL.h"
|
||||
#include "parallel_primitives/host/btPrefixScanCL.h"
|
||||
#include "parallel_primitives/host/CommandLineArgs.h"
|
||||
|
||||
#include "parallel_primitives/host/btMinMax.h"
|
||||
int g_nPassed = 0;
|
||||
int g_nFailed = 0;
|
||||
bool g_testFailed = 0;
|
||||
|
||||
#define TEST_INIT g_testFailed = 0;
|
||||
#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
|
||||
#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
|
||||
#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
|
||||
|
||||
cl_context g_context=0;
|
||||
cl_device_id g_device=0;
|
||||
cl_command_queue g_queue =0;
|
||||
const char* g_deviceName = 0;
|
||||
|
||||
void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
|
||||
{
|
||||
void* glCtx=0;
|
||||
void* glDC = 0;
|
||||
int ciErrNum = 0;
|
||||
//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
|
||||
|
||||
cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
|
||||
|
||||
g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
int numDev = btOpenCLUtils::getNumDevices(g_context);
|
||||
if (numDev>0)
|
||||
{
|
||||
btOpenCLDeviceInfo info;
|
||||
g_device= btOpenCLUtils::getDevice(g_context,0);
|
||||
g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
btOpenCLUtils::printDeviceInfo(g_device);
|
||||
btOpenCLUtils::getDeviceInfo(g_device,&info);
|
||||
g_deviceName = info.m_deviceName;
|
||||
}
|
||||
}
|
||||
|
||||
void exitCL()
|
||||
{
|
||||
clReleaseCommandQueue(g_queue);
|
||||
clReleaseContext(g_context);
|
||||
}
|
||||
|
||||
|
||||
inline void broadphaseTest()
|
||||
{
|
||||
TEST_INIT;
|
||||
|
||||
btGpuSapBroadphase* sap = new btGpuSapBroadphase(g_context,g_device,g_queue);
|
||||
int group=1;
|
||||
int mask=1;
|
||||
btVector3 aabbMin(0,0,0);
|
||||
btVector3 aabbMax(1,1,1);
|
||||
int usrPtr = 1;
|
||||
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
|
||||
|
||||
// aabbMin.setValue(2,2,2);
|
||||
// aabbMax.setValue(3,3,3);
|
||||
usrPtr = 2;
|
||||
sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
|
||||
sap->writeAabbsToGpu();
|
||||
|
||||
sap->calculateOverlappingPairs();
|
||||
|
||||
int numOverlap = sap->getNumOverlap();
|
||||
cl_mem buf = sap->getOverlappingPairBuffer();
|
||||
|
||||
TEST_ASSERT(numOverlap==1);
|
||||
|
||||
delete sap;
|
||||
|
||||
TEST_REPORT( "broadphaseTest" );
|
||||
}
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
int preferredDeviceIndex = -1;
|
||||
int preferredPlatformIndex = -1;
|
||||
|
||||
CommandLineArgs args(argc, argv);
|
||||
args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
|
||||
args.GetCmdLineArgument("platformId", preferredPlatformIndex);
|
||||
|
||||
initCL(preferredDeviceIndex,preferredPlatformIndex);
|
||||
|
||||
|
||||
broadphaseTest();
|
||||
|
||||
printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
|
||||
printf("End, press <enter>\n");
|
||||
getchar();
|
||||
|
||||
exitCL();
|
||||
|
||||
}
|
||||
|
||||
46
opencl/gpu_broadphase/test/premake4.lua
Normal file
46
opencl/gpu_broadphase/test/premake4.lua
Normal file
@@ -0,0 +1,46 @@
|
||||
function createProject(vendor)
|
||||
hasCL = findOpenCL(vendor)
|
||||
|
||||
if (hasCL) then
|
||||
|
||||
project ("OpenCL_broadphase_test_" .. vendor)
|
||||
|
||||
initOpenCL(vendor)
|
||||
|
||||
language "C++"
|
||||
|
||||
kind "ConsoleApp"
|
||||
targetdir "../../../bin"
|
||||
includedirs {"..","../.."}
|
||||
|
||||
|
||||
files {
|
||||
"main.cpp",
|
||||
"../../basic_initialize/btOpenCLInclude.h",
|
||||
"../../basic_initialize/btOpenCLUtils.cpp",
|
||||
"../../basic_initialize/btOpenCLUtils.h",
|
||||
"../host/btGpuSapBroadphase.cpp",
|
||||
"../host/btGpuSapBroadphase.h",
|
||||
"../../parallel_primitives/host/btFillCL.cpp",
|
||||
"../../parallel_primitives/host/btFillCL.h",
|
||||
"../../parallel_primitives/host/btBoundSearchCL.cpp",
|
||||
"../../parallel_primitives/host/btBoundSearchCL.h",
|
||||
"../../parallel_primitives/host/btPrefixScanCL.cpp",
|
||||
"../../parallel_primitives/host/btPrefixScanCL.h",
|
||||
"../../parallel_primitives/host/btRadixSort32CL.cpp",
|
||||
"../../parallel_primitives/host/btRadixSort32CL.h",
|
||||
"../../parallel_primitives/host/btAlignedAllocator.cpp",
|
||||
"../../parallel_primitives/host/btAlignedAllocator.h",
|
||||
"../../parallel_primitives/host/btAlignedObjectArray.h",
|
||||
"../../parallel_primitives/host/btQuickprof.cpp",
|
||||
"../../parallel_primitives/host/btQuickprof.h",
|
||||
|
||||
}
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
createProject("AMD")
|
||||
createProject("Intel")
|
||||
createProject("NVIDIA")
|
||||
createProject("Apple")
|
||||
@@ -652,6 +652,8 @@ int main( int argc, char** argv)
|
||||
printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n");
|
||||
cl_platform_id platformId;
|
||||
g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
// g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
|
||||
|
||||
oclCHECKERROR(ciErrNum, CL_SUCCESS);
|
||||
|
||||
int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext);
|
||||
@@ -674,7 +676,7 @@ int main( int argc, char** argv)
|
||||
//srand(time(NULL));
|
||||
srand(0); // presently deterministic
|
||||
|
||||
unsigned int num_elements = 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
|
||||
unsigned int num_elements = 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
|
||||
unsigned int iterations = 10;
|
||||
bool keys_only = true;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user