allow PairBenchmark to select broadphase type, with cpu brute-force, gpu brute-force etc

fix issue in PairBench, related to index offset
allow to add a large AABB in PairBench, to see the effect on the broadphase pair search performance
This commit is contained in:
erwin coumans
2014-01-28 17:11:56 -08:00
parent 71f0537c6e
commit 33ebebd1c9
10 changed files with 413 additions and 114 deletions

View File

@@ -116,7 +116,7 @@ void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
m_allAabbsCPU1.push_back(aabb);
m_smallAabbsCPU.push_back(aabb);
}
@@ -126,7 +126,7 @@ void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Ve
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
m_allAabbsCPU1.push_back(aabb);
m_largeAabbsCPU.push_back(aabb);
}
@@ -139,7 +139,7 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
if (0)
{
calculateOverlappingPairsHost(maxPairs);
/*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
@@ -147,25 +147,53 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
printf("host pair %d = %d,%d\n",i,cpuPairs[i].x,cpuPairs[i].y);
}
*/
return;
}
//sync small AABBs
{
int numSmallAabbs = m_smallAabbsGPU.size();
if (numSmallAabbs)
{
B3_PROFILE("copyAabbsKernelSmall");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL( m_allAabbsGPU1.getBufferCL(), true ),
b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()),
};
b3LauncherCL launcher(m_queue, m_copyAabbsKernel,"m_copyAabbsKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( numSmallAabbs );
int num = numSmallAabbs;
launcher.launch1D( num);
bool syncOnHost = false;
if (syncOnHost)
{
m_allAabbsGPU1.copyToHost(this->m_allAabbsCPU1);
b3AlignedObjectArray<b3SapAabb> hostSmallAabbs;
m_smallAabbsGPU.copyToHost(hostSmallAabbs);
int numSmallAabbs = hostSmallAabbs.size();
for (int i=0;i<numSmallAabbs;i++)
{
//__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
{
//int i = get_global_id(0);
//if (i>=numObjects)
// return;
int src = hostSmallAabbs[i].m_signedMaxIndices[3];
hostSmallAabbs[i] = m_allAabbsCPU1[src];
hostSmallAabbs[i].m_signedMaxIndices[3] = src;
}
}
m_smallAabbsGPU.copyFromHost(hostSmallAabbs);
} else
{
int numSmallAabbs = m_smallAabbsGPU.size();
if (numSmallAabbs)
{
B3_PROFILE("copyAabbsKernelSmall");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL( m_allAabbsGPU1.getBufferCL(), true ),
b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()),
};
b3LauncherCL launcher(m_queue, m_copyAabbsKernel,"m_copyAabbsKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( numSmallAabbs );
int num = numSmallAabbs;
launcher.launch1D( num);
}
}
}
//sync large AABBs
@@ -328,19 +356,19 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
}
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
#if 0
m_hostPairs.resize(0);
m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
for (int i=0;i<m_allAabbsCPU.size();i++)
for (int i=0;i<m_allAabbsCPU1.size();i++)
{
for (int j=i+1;j<m_allAabbsCPU.size();j++)
for (int j=i+1;j<m_allAabbsCPU1.size();j++)
{
if (b3TestAabbAgainstAabb2(m_allAabbsCPU[i].m_minVec, m_allAabbsCPU[i].m_maxVec,
m_allAabbsCPU[j].m_minVec,m_allAabbsCPU[j].m_maxVec))
if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
{
b3Int4 pair;
int a = m_allAabbsCPU[j].m_minIndices[3];
int b = m_allAabbsCPU[i].m_minIndices[3];
int a = m_allAabbsCPU1[j].m_minIndices[3];
int b = m_allAabbsCPU1[i].m_minIndices[3];
if (a<=b)
{
pair.x = a;
@@ -361,7 +389,7 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
m_gpuPairs.copyFromHost(m_hostPairs);
#endif
}

View File

@@ -15,7 +15,7 @@ bool searchIncremental3dSapOnGpu = true;
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType)
:m_context(ctx),
m_device(device),
m_queue(q),
@@ -64,10 +64,50 @@ m_removedCountGPU(ctx,q)
#else
m_prefixScanFloat4 = 0;
#endif
//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
m_sapKernel = 0;
switch (kernelType)
{
case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU:
{
m_sapKernel=0;
break;
}
case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU:
{
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBruteForce",&errNum,sapProg );
break;
}
case B3_GPU_SAP_KERNEL_ORIGINAL:
{
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
break;
}
case B3_GPU_SAP_KERNEL_BARRIER:
{
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
break;
}
case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY:
{
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
break;
}
case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE:
{
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
break;
}
default:
{
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory");
}
};
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
@@ -78,21 +118,21 @@ m_removedCountGPU(ctx,q)
m_computePairsIncremental3dSapKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsIncremental3dSapKernel",&errNum,sapFastProg );
b3Assert(errNum==CL_SUCCESS);
/*
#if 0
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
#else
#ifndef __APPLE__
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
b3Assert(errNum==CL_SUCCESS);
#else
m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
#endif
#endif
*/
m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
@@ -980,7 +1020,7 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
m_overlappingPairs.resize(0);
}
init3dSap();
//init3dSap();
}
@@ -999,9 +1039,15 @@ void b3GpuSapBroadphase::reset()
void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
{
if (m_sapKernel==0)
{
calculateOverlappingPairsHost(maxPairs);
return;
}
//if (m_currentBuffer>=0)
// return calculateOverlappingPairsHostIncremental3Sap();
//calculateOverlappingPairsHost(maxPairs);
B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs");

View File

@@ -11,6 +11,7 @@ class b3Vector3;
#include "b3GpuBroadphaseInterface.h"
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
@@ -87,12 +88,44 @@ public:
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
enum b3GpuSapKernelType
{
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
B3_GPU_SAP_KERNEL_ORIGINAL,
B3_GPU_SAP_KERNEL_BARRIER,
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY,
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE
};
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
virtual ~b3GpuSapBroadphase();
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q);
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
}
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
}
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
}
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
}
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
}
static b3GpuBroadphaseInterface* CreateFuncLocalMemoryBatchWrite(cl_context ctx,cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE);
}
virtual void calculateOverlappingPairs(int maxPairs);

View File

@@ -100,6 +100,32 @@ __kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAa
}
}
__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
if (i>=numObjects)
return;
for (int j=i+1;j<numObjects;j++)
{
if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
{
int4 myPair;
myPair.x = aabbs[i].m_minIndices[3];
myPair.y = aabbs[j].m_minIndices[3];
myPair.z = NEW_PAIR_MARKER;
myPair.w = NEW_PAIR_MARKER;
int curPair = atomic_inc (pairCount);
if (curPair<maxPairs)
{
pairsOut[curPair] = myPair; //flush to main memory
}
}
}
}
__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);

View File

@@ -328,8 +328,8 @@ __kernel void computePairsIncremental3dSapKernel( __global const uint2* object
}
//computePairsKernelBatchWrite
__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
__kernel void computePairsKernelLocalSharedMemoryBatchWrite( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)
{
int i = get_global_id(0);
int localId = get_local_id(0);

View File

@@ -296,8 +296,7 @@ static const char* sapFastCL= \
" }//for (int otherbuffer\n"
" }//for (int axis=0;\n"
"}\n"
"//computePairsKernelBatchWrite\n"
"__kernel void computePairsKernel( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"__kernel void computePairsKernelLocalSharedMemoryBatchWrite( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"

View File

@@ -88,6 +88,28 @@ static const char* sapCL= \
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"