diff --git a/Demos3/GpuDemos/broadphase/PairBench.cpp b/Demos3/GpuDemos/broadphase/PairBench.cpp index 3a908a7eb..3ca624e05 100644 --- a/Demos3/GpuDemos/broadphase/PairBench.cpp +++ b/Demos3/GpuDemos/broadphase/PairBench.cpp @@ -104,7 +104,7 @@ static PairBench* sPairDemo = 0; #define BP_COMBO_INDEX 123 -static int curSelectedBroadphase = 0; +static int curSelectedBroadphase = 5; static BroadphaseEntry allBroadphases[]= { {"Gpu Grid",b3GpuGridBroadphase::CreateFunc}, diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp index e2e0526c5..8ee378e6d 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp +++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp @@ -20,8 +20,8 @@ b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_co m_device(device), m_queue(q), m_allAabbsGPU(ctx,q), -m_smallAabbsGPU(ctx,q), -m_largeAabbsGPU(ctx,q), +m_smallAabbsMappingGPU(ctx,q), +m_largeAabbsMappingGPU(ctx,q), m_pairCount(ctx,q), m_overlappingPairs(ctx,q), m_gpuSmallSortData(ctx,q), @@ -891,33 +891,19 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) m_allAabbsGPU.copyToHost(m_allAabbsCPU); - - //m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs); - - - int numSmallAabbs = m_smallAabbsCPU.size(); - { - - for (int j=0;jm_allAabbsCPU[m_smallAabbsMappingCPU[i]]; + + b3Vector3 maxAabb=b3MakeVector3(aabb.m_max[0],aabb.m_max[1],aabb.m_max[2]); + b3Vector3 minAabb=b3MakeVector3(aabb.m_min[0],aabb.m_min[1],aabb.m_min[2]); b3Vector3 centerAabb=(maxAabb+minAabb)*0.5f; s += centerAabb; @@ -932,34 +918,28 @@ void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs) } - { - int numLargeAabbs = m_largeAabbsCPU.size(); - for (int j=0;j hostPairs; { - int numSmallAabbs = m_smallAabbsCPU.size(); + int numSmallAabbs = m_smallAabbsMappingCPU.size(); for (int i=0;i=0) // return calculateOverlappingPairsHostIncremental3Sap(); @@ -1057,47 +1045,7 @@ void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs) bool syncOnHost = false; - if (syncOnHost) - { - B3_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)"); - - m_allAabbsGPU.copyToHost(m_allAabbsCPU); - - m_smallAabbsGPU.copyToHost(m_smallAabbsCPU); - { - int numSmallAabbs = m_smallAabbsCPU.size(); - for (int j=0;j m_sum2; b3OpenCLArray m_dst; - b3OpenCLArray m_smallAabbsGPU; - b3AlignedObjectArray m_smallAabbsCPU; + b3OpenCLArray m_smallAabbsMappingGPU; + b3AlignedObjectArray m_smallAabbsMappingCPU; - b3OpenCLArray m_largeAabbsGPU; - b3AlignedObjectArray m_largeAabbsCPU; + b3OpenCLArray m_largeAabbsMappingGPU; + b3AlignedObjectArray m_largeAabbsMappingCPU; + b3OpenCLArray m_overlappingPairs; //temporary gpu work memory diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl index 533dac6e5..0d7f7d6cb 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl @@ -63,7 +63,7 @@ bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* } -__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs) +__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const btAabbCL* sortedAabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs) { int i = get_global_id(0); if (i>=numUnsortedAabbs) @@ -73,11 +73,14 @@ __kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAa if (j>=numSortedAabbs) return; - if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j])) + + __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]]; + + if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,&sortedAabbs[j])) { int4 myPair; - int xIndex = unsortedAabbs[i].m_minIndices[3]; + int xIndex = unsortedAabbPtr[0].m_minIndices[3]; int yIndex = sortedAabbs[j].m_minIndices[3]; if (xIndex>yIndex) { @@ -346,36 +349,40 @@ __kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btA } -__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis) +__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global int2* sortData, int numObjects, int axis) { int i = get_global_id(0); if (i>=numObjects) return; - - sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]); - sortData[i].y = i; + + + sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]); + sortData[i].y = i; } -__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects) +__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects) { int i = get_global_id(0); if (i>=numObjects) return; - - sortedAabbs[i] = aabbs[sortData[i].y]; + + sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]]; } -__kernel void prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs) +__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs) { int i = get_global_id(0); if (i>numAabbs) return; + + btAabbCL smallAabb = allAabbs[smallAabbMapping[i]]; + float4 s; - s = (aabbs[i].m_max+aabbs[i].m_min)*0.5f; + s = (smallAabb.m_max+smallAabb.m_min)*0.5f; sum[i]=s; sum2[i]=s*s; } diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h index 7ff0d7f0d..d1d8d4960 100644 --- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h +++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h @@ -56,7 +56,7 @@ static const char* sapCL= \ " overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n" " return overlap;\n" "}\n" -"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n" +"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const btAabbCL* sortedAabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numUnsortedAabbs)\n" @@ -64,11 +64,12 @@ static const char* sapCL= \ " int j = get_global_id(1);\n" " if (j>=numSortedAabbs)\n" " return;\n" -" if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n" +" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n" +" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,&sortedAabbs[j]))\n" " {\n" " int4 myPair;\n" " \n" -" int xIndex = unsortedAabbs[i].m_minIndices[3];\n" +" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n" " int yIndex = sortedAabbs[j].m_minIndices[3];\n" " if (xIndex>yIndex)\n" " {\n" @@ -305,30 +306,35 @@ static const char* sapCL= \ " destAabbs[i] = allAabbs[src];\n" " destAabbs[i].m_maxIndices[3] = src;\n" "}\n" -"__kernel void flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n" +"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global int2* sortData, int numObjects, int axis)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numObjects)\n" " return;\n" -" \n" -" sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n" -" sortData[i].y = i;\n" +" \n" +" \n" +" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n" +" sortData[i].y = i;\n" " \n" "}\n" -"__kernel void scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" +"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n" "{\n" " int i = get_global_id(0);\n" " if (i>=numObjects)\n" " return;\n" -" sortedAabbs[i] = aabbs[sortData[i].y];\n" +" \n" +" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n" "}\n" -"__kernel void prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs)\n" +"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n" "{\n" " int i = get_global_id(0);\n" " if (i>numAabbs)\n" " return;\n" +" \n" +" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n" +" \n" " float4 s;\n" -" s = (aabbs[i].m_max+aabbs[i].m_min)*0.5f;\n" +" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n" " sum[i]=s;\n" " sum2[i]=s*s; \n" "}\n"