CPU implementation of btCudaBroadphase added.

It is called bt3DGridBroadphase, and btCudaBroadphase is now derived from it rather than from btSimpleBroadphase
Test of bt3DGridBroadphase was added to CDTestFramework
This commit is contained in:
rponom
2008-11-25 03:16:11 +00:00
parent 7dda192bfc
commit 09aa2dbbe7
10 changed files with 1247 additions and 839 deletions

View File

@@ -65,9 +65,9 @@
// Reinterpret the bits of a fetched float4 texel as a btCuda3F1U.
// No value conversion takes place: the storage of 'a' is simply read back through a btCuda3F1U pointer.
__device__ inline btCuda3F1U tex_fetch3F1U(float4 a)
{
	btCuda3F1U* pTexel = (btCuda3F1U*)(&a);
	return *pTexel;
}
// Element access helpers for arrays that may be bound to a texture.
// With B_CUDA_USE_TEX non-zero, FETCH(t, i) / BT3DGRIDFETCH(t, i) read element i through the
// texture reference named <t>Tex (the macro argument with "Tex" appended) and reinterpret the
// fetched texel via tex_fetch3F1U(). Otherwise they expand to a plain array access t[i].
// Both macro names expand to exactly the same code.
#if B_CUDA_USE_TEX
#define FETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
#define BT3DGRIDFETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
#else
#define FETCH(t, i) t[i]
#define BT3DGRIDFETCH(t, i) t[i]
#endif
texture<uint2, 1, cudaReadModeElementType> particleHashTex;
@@ -80,323 +80,25 @@ __constant__ btCudaBroadphaseParams params;
//----------------------------------------------------------------------------------------
// Convert a world-space position into integer cell coordinates of the uniform grid.
// Per axis: subtract the world origin, divide by the cell size and round down (floor).
// The result is NOT clamped, so positions outside the grid yield out-of-range coordinates.
// Only the x, y and z components of 'p' are read.
__device__ int3 btCuda_calcGridPos(float4 p)
{
	int3 cell;
	cell.z = floor((p.z - params.m_worldOriginZ) / params.m_cellSizeZ);
	cell.y = floor((p.y - params.m_worldOriginY) / params.m_cellSizeY);
	cell.x = floor((p.x - params.m_worldOriginX) / params.m_cellSizeX);
	return cell;
}
// Mapping of the BT3DGRID* abstraction names onto their CUDA counterparts (qualifiers, intrinsics,
// built-in index variables, the constant 'params' block and the kernel launch syntax).
// NOTE(review): presumably these are consumed by "bt3DGridBroadphaseFunc.h", which this file
// includes further down, so that the same function bodies can be built for CUDA - confirm.
#define BT3DGRID__device__ __device__
#define BT3DGRIDmax(a, b) max(a, b)
#define BT3DGRIDmin(a, b) min(a, b)
#define BT3DGRIDparams params
#define BT3DGRID__mul24(a, b) __mul24(a, b)
#define BT3DGRID__global__ __global__
#define BT3DGRID__shared__ __shared__
#define BT3DGRID__syncthreads() __syncthreads()
#define BT3DGRIDmake_uint2(x, y) make_uint2(x, y)
#define BT3DGRIDmake_int3(x, y, z) make_int3(x, y, z)
// BT3DGRIDPREF(func) prepends the "btCuda_" prefix to a function name.
#define BT3DGRIDPREF(func) btCuda_##func
// NOTE(review): "GPRD" looks like a typo for "GRID"; left unchanged because the spelling has to
// match whatever name the code using this macro expects - confirm before renaming.
#define BT3DGPRDMemset cudaMemset
#define BT3DGRIDblockIdx blockIdx
#define BT3DGRIDblockDim blockDim
#define BT3DGRIDthreadIdx threadIdx
// Launches kernel 'kfunc' with 'numb' blocks of 'numt' threads; 'args' must be a parenthesized argument list.
#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) kfunc<<<numb, numt>>>args
//----------------------------------------------------------------------------------------
// Compute the linear cell address for a set of integer cell coordinates.
// Every coordinate is first clamped into [0, gridSize - 1] for its axis, then the address is
// formed as (z * gridSizeY) * gridSizeX + y * gridSizeX + x using 24-bit multiplies (__mul24).
__device__ uint btCuda_calcGridHash(int3 gridPos)
{
	// clamp each coordinate to the valid range of its axis
	const int cellX = max(0, min(gridPos.x, params.m_gridSizeX - 1));
	const int cellY = max(0, min(gridPos.y, params.m_gridSizeY - 1));
	const int cellZ = max(0, min(gridPos.z, params.m_gridSizeZ - 1));
	return __mul24(__mul24(cellZ, params.m_gridSizeY), params.m_gridSizeX) + __mul24(cellY, params.m_gridSizeX) + cellX;
}
//----------------------------------------------------------------------------------------
// Kernel: one thread per body; computes the grid cell address of the body's AABB center.
//   pAABB     - AABB array, two entries per body: [2*i] is the min corner, [2*i + 1] the max corner
//   pHash     - output, one (cell address, body index) pair per body
//   numBodies - number of bodies; threads with a larger global index do nothing
__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
{
	int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
	if(index < numBodies)
	{
		btCuda3F1U lo = pAABB[index*2];
		btCuda3F1U hi = pAABB[index*2 + 1];
		// center of the AABB (the w component is not used)
		float4 center;
		center.x = (lo.fx + hi.fx) * 0.5f;
		center.y = (lo.fy + hi.fy) * 0.5f;
		center.z = (lo.fz + hi.fz) * 0.5f;
		// cell coordinates -> clamped linear cell address
		uint cellAddr = btCuda_calcGridHash(btCuda_calcGridPos(center));
		// store cell address together with the body index
		pHash[index] = make_uint2(cellAddr, index);
	}
}
//----------------------------------------------------------------------------------------
// Kernel: one thread per entry of the hash array (expected to be sorted by cell address).
// For every cell that has at least one entry, writes the index of its first entry to cellStart.
//   pHash     - (cell address, body index) pairs
//   cellStart - output, indexed by cell address; entries of cells without bodies are not touched
//   numBodies - number of valid entries in pHash
// Precondition: blockDim.x <= 256, because the shared staging array holds blockDim.x + 1 values.
__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
	int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
	// Threads beyond the end of the data must not return early: every thread of the block has to
	// reach the __syncthreads() below, otherwise the barrier is executed in divergent code.
	// Same signed/unsigned comparison as the usual 'index >= numBodies' guard, just inverted.
	const bool inRange = !(index >= numBodies);
	// Load hash data into shared memory so that we can look
	// at neighboring body's hash value without loading
	// two hash values per thread
	__shared__ uint sharedHash[257];
	uint2 sortedData = make_uint2(0, 0);
	if(inRange)
	{
		sortedData = pHash[index];
		sharedHash[threadIdx.x+1] = sortedData.x;
		if((index > 0) && (threadIdx.x == 0))
		{
			// first thread in block must load neighbor body hash
			// NOTE(review): 'volatile' is kept from the original code; presumably a compiler workaround - confirm
			volatile uint2 prevData = pHash[index-1];
			sharedHash[0] = prevData.x;
		}
	}
	__syncthreads();
	// an entry starts a cell when it is the very first one or its cell differs from its predecessor's
	if(inRange && ((index == 0) || (sortedData.x != sharedHash[threadIdx.x])))
	{
		cellStart[sortedData.x] = index;
	}
}
//----------------------------------------------------------------------------------------
// Test two axis-aligned boxes for overlap.
// Returns non-zero when the [min, max] intervals intersect on all three axes;
// boxes that merely touch (equal bounds) count as overlapping.
__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
{
	const bool overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
	const bool overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
	const bool overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
	return overlapX && overlapY && overlapZ;
}
//----------------------------------------------------------------------------------------
// Tests the body at sorted position 'index' against the bodies stored in one grid cell and
// updates that body's section of the pair buffer.
//   gridPos            - cell coordinates to examine; cells outside the grid are skipped
//   index              - position of the current body in pHash
//   pHash              - (cell address, body index) pairs; entries of one cell are expected to be contiguous
//   pCellStart         - per cell: index in pHash of the cell's first entry, 0xffffffff if the cell is empty
//   pAABB              - AABB array, [2*i] = min corner, [2*i + 1] = max corner; min.uw holds the handle index
//   pPairBuff          - pair buffer; each handle owns the section starting at pPairBuffStartCurr[handle].x
//   pPairBuffStartCurr - per handle: x = start of its section, y = number of entries currently used
//   numBodies          - number of entries in pHash
__device__ void findPairsInCell(int3 gridPos,
uint index,
uint2* pHash,
uint* pCellStart,
btCuda3F1U* pAABB,
uint* pPairBuff,
uint2* pPairBuffStartCurr,
uint numBodies)
{
// btCuda_calcGridHash() clamps its input, so out-of-grid positions have to be rejected here;
// otherwise they would be mapped onto a border cell
if ( (gridPos.x < 0) || (gridPos.x > params.m_gridSizeX - 1)
|| (gridPos.y < 0) || (gridPos.y > params.m_gridSizeY - 1)
|| (gridPos.z < 0) || (gridPos.z > params.m_gridSizeZ - 1))
{
return;
}
uint gridHash = btCuda_calcGridHash(gridPos);
// get start of bucket for this cell
uint bucketStart = pCellStart[gridHash];
if (bucketStart == 0xffffffff)
{
return; // cell empty
}
// iterate over bodies in this cell
uint2 sortedData = pHash[index];
uint unsorted_indx = sortedData.y;
btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
// the handle index of the body is stored in the uw field of its min corner
uint handleIndex = min0.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
// the section of this handle ends where the section of the next handle starts;
// curr_max is the last usable entry index inside the section
uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
uint curr_max = start_curr_next.x - start - 1;
// examine at most m_maxBodiesPerCell entries and never read past the end of pHash
uint bucketEnd = bucketStart + params.m_maxBodiesPerCell;
bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
{
uint2 cellData = pHash[index2];
if (cellData.x != gridHash)
{
break; // no longer in same bucket
}
uint unsorted_indx2 = cellData.y;
// only bodies with a smaller index are tested, which also excludes the body itself
if (unsorted_indx2 < unsorted_indx) // check not colliding with self
{
btCuda3F1U min1 = FETCH(pAABB, unsorted_indx2*2);
btCuda3F1U max1 = FETCH(pAABB, unsorted_indx2*2 + 1);
if(cudaTestAABBOverlap(min0, max0, min1, max1))
{
uint handleIndex2 = min1.uw;
uint k;
// look for the other handle among the entries already stored (flag bits masked out)
for(k = 0; k < curr; k++)
{
uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
if(old_pair == handleIndex2)
{
pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
break;
}
}
// k == curr means the search loop ran to completion: the pair is not stored yet
if(k == curr)
{
pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
// section full: the entry above went into the last slot, but curr is not advanced,
// so it is not counted and the scan of this cell stops
if(curr >= curr_max)
{ // not a good solution, but let's avoid crash
break;
}
curr++;
}
}
}
}
// write back the (possibly increased) number of used entries
pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
return;
}
//----------------------------------------------------------------------------------------
// Kernel: one thread per entry of the hash array.
// Locates the grid cell containing the center of the body's AABB and runs findPairsInCell()
// on that cell and its neighbours (a 3 x 3 x 3 block, visited z-outermost, x-innermost).
//   pAABB              - AABB array, [2*i] = min corner, [2*i + 1] = max corner
//   pHash              - (cell address, body index) pairs
//   pCellStart         - per cell: index of the first entry in pHash
//   pPairBuff          - pair buffer, updated by findPairsInCell()
//   pPairBuffStartCurr - per handle (start, used count) of its pair buffer section
//   numBodies          - number of entries in pHash; threads beyond it do nothing
__global__ void
findOverlappingPairsD(	btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
						uint2* pPairBuffStartCurr, uint numBodies)
{
	int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
	if(index < numBodies)
	{
		uint bodyIdx = pHash[index].y;
		btCuda3F1U lo = FETCH(pAABB, bodyIdx*2);
		btCuda3F1U hi = FETCH(pAABB, bodyIdx*2 + 1);
		// center of the AABB (the w component is not used)
		float4 center;
		center.x = (lo.fx + hi.fx) * 0.5f;
		center.y = (lo.fy + hi.fy) * 0.5f;
		center.z = (lo.fz + hi.fz) * 0.5f;
		// cell containing the center
		int3 homeCell = btCuda_calcGridPos(center);
		// visit the home cell and all cells adjacent to it
		for(int dz = -1; dz <= 1; dz++)
		{
			for(int dy = -1; dy <= 1; dy++)
			{
				for(int dx = -1; dx <= 1; dx++)
				{
					int3 cell = homeCell + make_int3(dx, dy, dz);
					findPairsInCell(cell, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
				}
			}
		}
	}
}
//----------------------------------------------------------------------------------------
// Kernel: one thread per entry of the hash array.
// Tests the body against every "large" AABB and updates the body's section of the pair buffer.
// The large AABBs are read from pAABB at body indices numBodies .. numBodies + numLarge - 1.
//   pAABB              - AABB array, [2*i] = min corner, [2*i + 1] = max corner; min.uw holds the handle index
//   pHash              - (cell address, body index) pairs
//   pCellStart         - not used by this kernel
//   pPairBuff          - pair buffer; each handle owns the section starting at pPairBuffStartCurr[handle].x
//   pPairBuffStartCurr - per handle: x = start of its section, y = number of entries currently used
//   numBodies          - number of entries in pHash; threads beyond it do nothing
//   numLarge           - number of large AABBs
__global__ void
findPairsLargeD(	btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
					uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
	int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
	if(index >= numBodies)
	{
		return;
	}
	const uint bodyIdx = pHash[index].y;
	btCuda3F1U bodyMin = FETCH(pAABB, bodyIdx*2);
	btCuda3F1U bodyMax = FETCH(pAABB, bodyIdx*2 + 1);
	const uint handle = bodyMin.uw;
	const uint2 section = pPairBuffStartCurr[handle];
	const uint listBase = section.x;
	uint listLen = section.y;
	// last usable entry index: the section ends where the next handle's section starts
	const uint listLast = pPairBuffStartCurr[handle+1].x - listBase - 1;
	bool listFull = false;
	for(uint i = 0; (i < numLarge) && !listFull; i++)
	{
		const uint largeIdx = numBodies + i;
		btCuda3F1U largeMin = FETCH(pAABB, largeIdx*2);
		btCuda3F1U largeMax = FETCH(pAABB, largeIdx*2 + 1);
		if(!cudaTestAABBOverlap(bodyMin, bodyMax, largeMin, largeMax))
		{
			continue;
		}
		const uint otherHandle = largeMin.uw;
		// search the stored entries (flag bits masked out) for this handle
		bool alreadyStored = false;
		for(uint k = 0; k < listLen; k++)
		{
			if((pPairBuff[listBase+k] & (~BT_CUDA_PAIR_ANY_FLG)) == otherHandle)
			{
				pPairBuff[listBase+k] |= BT_CUDA_PAIR_FOUND_FLG;
				alreadyStored = true;
				break;
			}
		}
		if(alreadyStored)
		{
			continue;
		}
		// not stored yet: append it, marked as new
		pPairBuff[listBase+listLen] = otherHandle | BT_CUDA_PAIR_NEW_FLG;
		if(listLen >= listLast)
		{	// not a good solution, but let's avoid crash:
			// the last slot was written but is not counted, and the scan stops
			listFull = true;
		}
		else
		{
			listLen++;
		}
	}
	pPairBuffStartCurr[handle] = make_uint2(listBase, listLen);
}
//----------------------------------------------------------------------------------------
// Kernel: one thread per body.
// Counts the entries in the body's pair buffer section that do NOT carry BT_CUDA_PAIR_FOUND_FLG
// and stores the count at pPairScan[index + 1] (element 0 is not written by this kernel).
//   pPairBuff          - pair buffer
//   pPairBuffStartCurr - per handle: x = start of its section, y = number of entries currently used
//   pPairScan          - output counts, shifted by one element
//   pAABB              - AABB array; the uw field of the min corner at [2*index] holds the handle index
//   numBodies          - number of bodies; threads beyond it do nothing
__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
{
	int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
	if(index < numBodies)
	{
		const uint handle = pAABB[index * 2].uw;
		const uint2 section = pPairBuffStartCurr[handle];
		const uint* pList = pPairBuff + section.x;
		const uint listLen = section.y;
		uint changed = 0;
		for(uint k = 0; k < listLen; k++)
		{
			changed += (pList[k] & BT_CUDA_PAIR_FOUND_FLG) ? 0 : 1;
		}
		pPairScan[index+1] = changed;
	}
}
//----------------------------------------------------------------------------------------
// Kernel: one thread per body. Performs two things on the body's pair buffer section:
//  1. copies every entry that does NOT carry BT_CUDA_PAIR_FOUND_FLG (flags included) to pPairOut,
//     starting at offset pPairScan[index];
//  2. compacts the section in place, keeping only entries that have at least one bit of
//     BT_CUDA_PAIR_ANY_FLG set, with those flag bits cleared, and stores the new entry count.
//   pPairBuff          - pair buffer, compacted in place
//   pPairBuffStartCurr - per handle: x = start of its section, y = number of used entries (updated)
//   pPairScan          - per body: output offset into pPairOut
//   pPairOut           - output list of entries without the FOUND flag
//   pAABB              - AABB array; the uw field of the min corner at [2*index] holds the handle index
//   numBodies          - number of bodies; threads beyond it do nothing
__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
{
int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if(index >= numBodies)
{
return;
}
btCuda3F1U bbMin = pAABB[index * 2];
uint handleIndex = bbMin.uw;
uint2 start_curr = pPairBuffStartCurr[handleIndex];
uint start = start_curr.x;
uint curr = start_curr.y;
// pInp: read cursor over the section; pOut: write cursor into pPairOut;
// pOut2: write cursor for the in-place compaction, it never gets ahead of pInp
uint* pInp = pPairBuff + start;
uint* pOut = pPairOut + pPairScan[index];
uint* pOut2 = pInp;
uint num = 0;
for(uint k = 0; k < curr; k++, pInp++)
{
// entry was not confirmed as an existing pair: report it, flags included
if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
{
*pOut = *pInp;
pOut++;
}
// entry has a flag set: keep it in the section with the flag bits stripped;
// entries without any flag are dropped from the section
if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
{
*pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
pOut2++;
num++;
}
}
// the section keeps its start; only the number of used entries changes
pPairBuffStartCurr[handleIndex] = make_uint2(start, num);
} // squeezeOverlappingPairBuffD()
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// E N D O F K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//! Check for CUDA error
# define CUT_CHECK_ERROR(errorMessage) do { \
cudaError_t err = cudaGetLastError(); \
@@ -430,9 +132,7 @@ __global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffSta
btCuda_exit(EXIT_FAILURE); \
} } while (0)
extern "C"
{
//----------------------------------------------------------------------------------------
void btCuda_exit(int val)
{
@@ -465,125 +165,9 @@ void btCuda_setParameters(btCudaBroadphaseParams* hostParams)
MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(btCudaBroadphaseParams)));
}
// Integer division a / b, rounded up to the next integer whenever there is a remainder.
// b must not be zero.
int btCuda_iDivUp(int a, int b)
{
	const int quotient = a / b;
	const int remainder = a % b;
	if(remainder == 0)
	{
		return quotient;
	}
	return quotient + 1;
}
//----------------------------------------------------------------------------------------
// Compute grid and thread block size for a given number of elements.
//   n          - number of elements to process
//   blockSize  - maximum number of threads per block
//   numBlocks  - output: number of blocks, always >= 1
//   numThreads - output: threads per block = min(blockSize, n), but always >= 1
// For n == 0 the unclamped computation would give numThreads == 0, followed by a division by zero
// and a launch configuration with zero-sized dimensions. Both outputs are therefore clamped to at
// least 1; the launched threads are expected to be rejected by the kernels' own range checks.
void btCuda_computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
{
	int threads = (blockSize < n) ? blockSize : n;
	if(threads < 1)
	{
		threads = 1;
	}
	// ceil(n / threads) without the overflow risk of (n + threads - 1)
	int blocks = n / threads + ((n % threads != 0) ? 1 : 0);
	if(blocks < 1)
	{
		blocks = 1;
	}
	numThreads = threads;
	numBlocks = blocks;
}
#include "bt3DGridBroadphaseFunc.h"
// Host wrapper: launches calcHashAABBD with blocks of up to 256 threads, one thread per body,
// then checks for a launch error.
//   pAABB     - AABB array, passed to the kernel unchanged
//   hash      - output buffer, handed to the kernel as an array of uint2
//   numBodies - number of bodies
void btCuda_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
	int blockCount, threadsPerBlock;
	btCuda_computeGridSize(numBodies, 256, blockCount, threadsPerBlock);
	uint2* pHashPairs = (uint2*)hash;
	// execute the kernel
	calcHashAABBD<<< blockCount, threadsPerBlock >>>(pAABB, pHashPairs, numBodies);
	// check if kernel invocation generated an error
	CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
}
//----------------------------------------------------------------------------------------
// Host wrapper: sets every byte of the cellStart array to 0xff (each entry becomes 0xffffffff) and
// launches findCellStartD with blocks of up to 256 threads, one thread per hash entry.
//   hash      - hash buffer, handed to the kernel as an array of uint2
//   cellStart - array of numCells entries, reset and then filled by the kernel
//   numBodies - number of hash entries
//   numCells  - number of grid cells
void btCuda_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
{
	int blockCount, threadsPerBlock;
	btCuda_computeGridSize(numBodies, 256, blockCount, threadsPerBlock);
	// reset all cells before the kernel records the occupied ones
	MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
	uint2* pHashPairs = (uint2*)hash;
	uint* pStarts = (uint*)cellStart;
	findCellStartD<<< blockCount, threadsPerBlock >>>(pHashPairs, pStarts, numBodies);
	CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
}
// Host wrapper: launches findOverlappingPairsD with blocks of up to 64 threads, one thread per body.
// With B_CUDA_USE_TEX enabled, the AABB array (2 entries per body) is bound to pAABBTex for the
// duration of the launch and unbound afterwards.
//   pAABB              - AABB array
//   pHash              - hash buffer, handed to the kernel as an array of uint2
//   pCellStart         - per-cell start indices
//   pPairBuff          - pair buffer
//   pPairBuffStartCurr - handed to the kernel as an array of uint2
//   numBodies          - number of bodies
void btCuda_findOverlappingPairs(	btCuda3F1U* pAABB, unsigned int* pHash,
									unsigned int* pCellStart,
									unsigned int* pPairBuff,
									unsigned int* pPairBuffStartCurr,
									unsigned int numBodies)
{
#if B_CUDA_USE_TEX
	MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
#endif
	int blockCount, threadsPerBlock;
	btCuda_computeGridSize(numBodies, 64, blockCount, threadsPerBlock);
	uint2* pHashPairs = (uint2*)pHash;
	uint* pStarts = (uint*)pCellStart;
	uint* pPairs = (uint*)pPairBuff;
	uint2* pSections = (uint2*)pPairBuffStartCurr;
	findOverlappingPairsD<<< blockCount, threadsPerBlock >>>(pAABB, pHashPairs, pStarts, pPairs, pSections, numBodies);
	CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
#if B_CUDA_USE_TEX
	MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // btCuda_findOverlappingPairs()
// Host wrapper: launches findPairsLargeD with blocks of up to 64 threads, one thread per body.
// With B_CUDA_USE_TEX enabled, the AABB array covering numBodies + numLarge bodies (2 entries each)
// is bound to pAABBTex for the duration of the launch and unbound afterwards.
//   pAABB              - AABB array
//   pHash              - hash buffer, handed to the kernel as an array of uint2
//   pCellStart         - per-cell start indices
//   pPairBuff          - pair buffer
//   pPairBuffStartCurr - handed to the kernel as an array of uint2
//   numBodies          - number of bodies
//   numLarge           - number of large AABBs
void btCuda_findPairsLarge(	btCuda3F1U* pAABB, unsigned int* pHash,
							unsigned int* pCellStart,
							unsigned int* pPairBuff,
							unsigned int* pPairBuffStartCurr,
							unsigned int numBodies,
							unsigned int numLarge)
{
#if B_CUDA_USE_TEX
	MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
#endif
	int blockCount, threadsPerBlock;
	btCuda_computeGridSize(numBodies, 64, blockCount, threadsPerBlock);
	uint2* pHashPairs = (uint2*)pHash;
	uint* pStarts = (uint*)pCellStart;
	uint* pPairs = (uint*)pPairBuff;
	uint2* pSections = (uint2*)pPairBuffStartCurr;
	findPairsLargeD<<< blockCount, threadsPerBlock >>>(pAABB, pHashPairs, pStarts, pPairs, pSections, numBodies, numLarge);
	CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
#if B_CUDA_USE_TEX
	MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // btCuda_findPairsLarge()
// Host wrapper: launches computePairCacheChangesD with blocks of up to 256 threads,
// one thread per body, then checks for a launch error.
//   pPairBuff          - pair buffer
//   pPairBuffStartCurr - handed to the kernel as an array of uint2
//   pPairScan          - output array of per-body counts
//   pAABB              - AABB array
//   numBodies          - number of bodies
void btCuda_computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr,
									unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies)
{
	int blockCount, threadsPerBlock;
	btCuda_computeGridSize(numBodies, 256, blockCount, threadsPerBlock);
	uint* pPairs = (uint*)pPairBuff;
	uint2* pSections = (uint2*)pPairBuffStartCurr;
	uint* pScan = (uint*)pPairScan;
	computePairCacheChangesD<<< blockCount, threadsPerBlock >>>(pPairs, pSections, pScan, pAABB, numBodies);
	CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
} // btCuda_computePairCacheChanges()
// Host wrapper: launches squeezeOverlappingPairBuffD with blocks of up to 256 threads,
// one thread per body, then checks for a launch error.
//   pPairBuff          - pair buffer
//   pPairBuffStartCurr - handed to the kernel as an array of uint2
//   pPairScan          - per-body offsets into pPairOut
//   pPairOut           - output buffer
//   pAABB              - AABB array
//   numBodies          - number of bodies
void btCuda_squeezeOverlappingPairBuff(	unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan,
										unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies)
{
	int blockCount, threadsPerBlock;
	btCuda_computeGridSize(numBodies, 256, blockCount, threadsPerBlock);
	uint* pPairs = (uint*)pPairBuff;
	uint2* pSections = (uint2*)pPairBuffStartCurr;
	uint* pScan = (uint*)pPairScan;
	uint* pOut = (uint*)pPairOut;
	squeezeOverlappingPairBuffD<<< blockCount, threadsPerBlock >>>(pPairs, pSections, pScan, pOut, pAABB, numBodies);
	CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
} // btCuda_squeezeOverlappingPairBuff()
} // extern "C"