CPU implementation of btCudaBroadphase added.

It is called bt3DGridBroadphase, and btCudaBroadphase is now derived from it rather than from btSimpleBroadphase.
A test of bt3DGridBroadphase was added to CDTestFramework.
This commit is contained in:
rponom
2008-11-25 03:16:11 +00:00
parent 7dda192bfc
commit 09aa2dbbe7
10 changed files with 1247 additions and 839 deletions
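
A minimal sketch of the class relationship described in the commit message, assuming bt3DGridBroadphase itself keeps the btSimpleBroadphase base (stub declarations only, not the actual headers):

class btSimpleBroadphase { /* existing brute-force broadphase */ };

// new CPU implementation of the uniform 3D-grid broadphase
class bt3DGridBroadphase : public btSimpleBroadphase { /* ... */ };

// the CUDA broadphase now reuses the grid logic and overrides its per-stage kernels
class btCudaBroadphase : public bt3DGridBroadphase { /* ... */ };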


@@ -0,0 +1,444 @@
/*
* Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO USER:
*
* This source code is subject to NVIDIA ownership rights under U.S. and
* international Copyright laws.
*
* NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
* CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
* IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
* REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
* OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
* OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
* OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
* OR PERFORMANCE OF THIS SOURCE CODE.
*
* U.S. Government End Users. This source code is a "commercial item" as
* that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
* "commercial computer software" and "commercial computer software
* documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
* and is provided to the U.S. Government only as a commercial end item.
* Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
* 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
* source code with only those rights set forth herein.
*/
#include "btCudaBroadphaseKernel.h"
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// calculate position in uniform grid
BT3DGRID__device__ int3 btCuda_calcGridPos(float4 p)
{
    int3 gridPos;
    gridPos.x = (int)floor((p.x - BT3DGRIDparams.m_worldOriginX) / BT3DGRIDparams.m_cellSizeX);
    gridPos.y = (int)floor((p.y - BT3DGRIDparams.m_worldOriginY) / BT3DGRIDparams.m_cellSizeY);
    gridPos.z = (int)floor((p.z - BT3DGRIDparams.m_worldOriginZ) / BT3DGRIDparams.m_cellSizeZ);
    return gridPos;
}
//----------------------------------------------------------------------------------------
// calculate address in grid from position (clamping to edges)
BT3DGRID__device__ uint btCuda_calcGridHash(int3 gridPos)
{
    gridPos.x = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.x, (int)BT3DGRIDparams.m_gridSizeX - 1));
    gridPos.y = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.y, (int)BT3DGRIDparams.m_gridSizeY - 1));
    gridPos.z = BT3DGRIDmax(0, BT3DGRIDmin(gridPos.z, (int)BT3DGRIDparams.m_gridSizeZ - 1));
    return BT3DGRID__mul24(BT3DGRID__mul24(gridPos.z, BT3DGRIDparams.m_gridSizeY), BT3DGRIDparams.m_gridSizeX) + BT3DGRID__mul24(gridPos.y, BT3DGRIDparams.m_gridSizeX) + gridPos.x;
}
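//----------------------------------------------------------------------------------------
// A minimal host-side sketch of the same cell lookup, using an illustrative GridDesc struct
// in place of BT3DGRIDparams (the names below are assumptions, not the real interface):
// clamp the cell coordinates to the grid, then linearize them in x-major order exactly as
// btCuda_calcGridHash does.
#include <cmath>
#include <algorithm>
struct GridDesc
{
    float originX, originY, originZ;        // world-space position of cell (0,0,0)
    float cellSizeX, cellSizeY, cellSizeZ;  // extents of one cell
    int   sizeX, sizeY, sizeZ;              // number of cells along each axis
};
static unsigned int cellHashOf(const GridDesc& g, float x, float y, float z)
{
    int cx = std::min(std::max((int)std::floor((x - g.originX) / g.cellSizeX), 0), g.sizeX - 1);
    int cy = std::min(std::max((int)std::floor((y - g.originY) / g.cellSizeY), 0), g.sizeY - 1);
    int cz = std::min(std::max((int)std::floor((z - g.originZ) / g.cellSizeZ), 0), g.sizeZ - 1);
    return (unsigned int)((cz * g.sizeY + cy) * g.sizeX + cx);
}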
//----------------------------------------------------------------------------------------
// calculate grid hash value for each body using its AABB
BT3DGRID__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
{
    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
    if(index >= (int)numBodies)
    {
        return;
    }
    btCuda3F1U bbMin = pAABB[index*2];
    btCuda3F1U bbMax = pAABB[index*2 + 1];
    float4 pos;
    pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
    pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
    pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
    // get address in grid
    int3 gridPos = btCuda_calcGridPos(pos);
    uint gridHash = btCuda_calcGridHash(gridPos);
    // store grid hash and body index
    pHash[index] = BT3DGRIDmake_uint2(gridHash, index);
}
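//----------------------------------------------------------------------------------------
// calcHashAABBD only produces the (cellHash, bodyIndex) pairs; before findCellStartD runs,
// the pairs are assumed to be sorted by hash outside this file (a radix sort on the GPU path),
// so that bodies sharing a cell become contiguous. A host-side sketch of that ordering step:
#include <vector>
#include <utility>
#include <algorithm>
static void sortByCellHash(std::vector<std::pair<unsigned int, unsigned int> >& hashBodyPairs)
{
    // std::pair compares by .first (the cell hash) before .second (the body index)
    std::sort(hashBodyPairs.begin(), hashBodyPairs.end());
}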
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
    if(index >= (int)numBodies)
    {
        return;
    }
    uint2 sortedData = pHash[index];
    // Load hash data into shared memory so that we can look
    // at neighboring body's hash value without loading
    // two hash values per thread
    BT3DGRID__shared__ uint sharedHash[257];
    sharedHash[BT3DGRIDthreadIdx.x+1] = sortedData.x;
    if((index > 0) && (BT3DGRIDthreadIdx.x == 0))
    {
        // first thread in block must load neighbor body hash
        volatile uint2 prevData = pHash[index-1];
        sharedHash[0] = prevData.x;
    }
    BT3DGRID__syncthreads();
    if((index == 0) || (sortedData.x != sharedHash[BT3DGRIDthreadIdx.x]))
    {
        cellStart[sortedData.x] = index;
    }
}
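//----------------------------------------------------------------------------------------
// A sequential sketch of what findCellStartD computes: for every cell hash present in the
// sorted array, record the index of its first occurrence; cells with no bodies keep the
// 0xffffffff "empty" marker that findPairsInCell checks for.
#include <vector>
#include <utility>
#include <cstddef>
static void findCellStartHost(const std::vector<std::pair<unsigned int, unsigned int> >& sortedHash,
                              std::vector<unsigned int>& cellStart)   // pre-sized to the cell count
{
    for (std::size_t c = 0; c < cellStart.size(); c++)
    {
        cellStart[c] = 0xffffffff;
    }
    for (std::size_t i = 0; i < sortedHash.size(); i++)
    {
        unsigned int hash = sortedHash[i].first;
        if ((i == 0) || (hash != sortedHash[i - 1].first))
        {
            cellStart[hash] = (unsigned int)i;   // first body of this cell in the sorted order
        }
    }
}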
//----------------------------------------------------------------------------------------
BT3DGRID__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
{
    return (min0.fx <= max1.fx) && (min1.fx <= max0.fx) &&
           (min0.fy <= max1.fy) && (min1.fy <= max0.fy) &&
           (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
}
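//----------------------------------------------------------------------------------------
// The test above is the usual per-axis interval check: two AABBs overlap iff their [min, max]
// ranges intersect on x, y and z. A tiny self-contained sketch with an illustrative Box type:
struct Box { float minX, minY, minZ, maxX, maxY, maxZ; };
static bool boxesOverlap(const Box& a, const Box& b)
{
    return (a.minX <= b.maxX) && (b.minX <= a.maxX) &&
           (a.minY <= b.maxY) && (b.minY <= a.maxY) &&
           (a.minZ <= b.maxZ) && (b.minZ <= a.maxZ);
}
// e.g. {0,0,0, 2,2,2} and {1,1,1, 3,3,3} overlap; moving the second box to minX = 2.5
// separates the x intervals and the test fails.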
//----------------------------------------------------------------------------------------
BT3DGRID__device__ void findPairsInCell(int3 gridPos,
                                        uint index,
                                        uint2* pHash,
                                        uint* pCellStart,
                                        btCuda3F1U* pAABB,
                                        uint* pPairBuff,
                                        uint2* pPairBuffStartCurr,
                                        uint numBodies)
{
    if ( (gridPos.x < 0) || (gridPos.x > (int)BT3DGRIDparams.m_gridSizeX - 1)
      || (gridPos.y < 0) || (gridPos.y > (int)BT3DGRIDparams.m_gridSizeY - 1)
      || (gridPos.z < 0) || (gridPos.z > (int)BT3DGRIDparams.m_gridSizeZ - 1))
    {
        return;
    }
    uint gridHash = btCuda_calcGridHash(gridPos);
    // get start of bucket for this cell
    uint bucketStart = pCellStart[gridHash];
    if (bucketStart == 0xffffffff)
    {
        return; // cell empty
    }
    // iterate over bodies in this cell
    uint2 sortedData = pHash[index];
    uint unsorted_indx = sortedData.y;
    btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
    btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
    uint handleIndex = min0.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
    uint curr_max = start_curr_next.x - start - 1;
    uint bucketEnd = bucketStart + BT3DGRIDparams.m_maxBodiesPerCell;
    bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
    for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
    {
        uint2 cellData = pHash[index2];
        if (cellData.x != gridHash)
        {
            break; // no longer in same bucket
        }
        uint unsorted_indx2 = cellData.y;
        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
        {
            btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2);
            btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, unsorted_indx2*2 + 1);
            if(cudaTestAABBOverlap(min0, max0, min1, max1))
            {
                uint handleIndex2 = min1.uw;
                uint k;
                for(k = 0; k < curr; k++)
                {
                    uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
                    if(old_pair == handleIndex2)
                    {
                        pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
                        break;
                    }
                }
                if(k == curr)
                {
                    pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
                    if(curr >= curr_max)
                    { // not a good solution, but let's avoid crash
                        break;
                    }
                    curr++;
                }
            }
        }
    }
    pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr);
    return;
}
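//----------------------------------------------------------------------------------------
// Bookkeeping used above: pPairBuffStartCurr[h] holds (start, curr) for handle h, i.e. the offset
// of its slot in pPairBuff and the number of pair entries currently stored there; the next handle's
// start bounds the slot. Each entry is the other handle's index OR'ed with status flags. A host-side
// sketch of the add-or-mark step, with illustrative flag values (the real BT_CUDA_PAIR_* constants
// are defined in btCudaBroadphaseKernel.h):
static const unsigned int PAIR_NEW_BIT   = 0x40000000;   // assumed values, for the sketch only
static const unsigned int PAIR_FOUND_BIT = 0x80000000;
static const unsigned int PAIR_ANY_BITS  = PAIR_NEW_BIT | PAIR_FOUND_BIT;
static void addOrMarkPair(unsigned int* pairBuff, unsigned int start, unsigned int& curr,
                          unsigned int currMax, unsigned int otherHandle)
{
    for (unsigned int k = 0; k < curr; k++)
    {
        if ((pairBuff[start + k] & ~PAIR_ANY_BITS) == otherHandle)
        {
            pairBuff[start + k] |= PAIR_FOUND_BIT;   // cached pair still overlaps this frame
            return;
        }
    }
    pairBuff[start + curr] = otherHandle | PAIR_NEW_BIT;   // pair seen for the first time
    if (curr < currMax)
    {
        curr++;   // stop growing once the slot is full, as the kernel does
    }
}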
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void
findOverlappingPairsD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
                       uint2* pPairBuffStartCurr, uint numBodies)
{
    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
    if(index >= (int)numBodies)
    {
        return;
    }
    uint2 sortedData = pHash[index];
    uint unsorted_indx = sortedData.y;
    btCuda3F1U bbMin = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
    btCuda3F1U bbMax = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
    float4 pos;
    pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
    pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
    pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
    // get address in grid
    int3 gridPos = btCuda_calcGridPos(pos);
    // examine only neighbouring cells
    for(int z=-1; z<=1; z++) {
        for(int y=-1; y<=1; y++) {
            for(int x=-1; x<=1; x++) {
                findPairsInCell(gridPos + BT3DGRIDmake_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
            }
        }
    }
}
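//----------------------------------------------------------------------------------------
// Note: visiting only the 27 cells around the AABB centre finds all overlaps only under the
// assumption that bodies routed through the grid are no larger than about one cell; bigger
// bodies are expected to be registered as "large" and tested exhaustively by findPairsLargeD below.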
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void
findPairsLargeD( btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
                 uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
    if(index >= (int)numBodies)
    {
        return;
    }
    uint2 sortedData = pHash[index];
    uint unsorted_indx = sortedData.y;
    btCuda3F1U min0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2);
    btCuda3F1U max0 = BT3DGRIDFETCH(pAABB, unsorted_indx*2 + 1);
    uint handleIndex = min0.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
    uint curr_max = start_curr_next.x - start - 1;
    for(uint i = 0; i < numLarge; i++)
    {
        uint indx2 = numBodies + i;
        btCuda3F1U min1 = BT3DGRIDFETCH(pAABB, indx2*2);
        btCuda3F1U max1 = BT3DGRIDFETCH(pAABB, indx2*2 + 1);
        if(cudaTestAABBOverlap(min0, max0, min1, max1))
        {
            uint k;
            uint handleIndex2 = min1.uw;
            for(k = 0; k < curr; k++)
            {
                uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
                if(old_pair == handleIndex2)
                {
                    pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
                    break;
                }
            }
            if(k == curr)
            {
                pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
                if(curr >= curr_max)
                { // not a good solution, but let's avoid crash
                    break;
                }
                curr++;
            }
        }
    }
    pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, curr);
    return;
}
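//----------------------------------------------------------------------------------------
// Large bodies bypass the grid: their AABBs sit after the first numBodies entries of pAABB
// (indx2 = numBodies + i above) and every grid body is tested against all of them here,
// reusing the same per-handle pair slots and NEW/FOUND flags as findPairsInCell.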
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
{
    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
    if(index >= (int)numBodies)
    {
        return;
    }
    btCuda3F1U bbMin = pAABB[index * 2];
    uint handleIndex = bbMin.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint *pInp = pPairBuff + start;
    uint num_changes = 0;
    for(uint k = 0; k < curr; k++, pInp++)
    {
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            num_changes++;
        }
    }
    pPairScan[index+1] = num_changes;
}
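//----------------------------------------------------------------------------------------
// computePairCacheChangesD stores per-body change counts at pPairScan[bodyIndex + 1], while
// squeezeOverlappingPairBuffD below reads pPairScan[bodyIndex] as its output offset, so a
// prefix sum over pPairScan is assumed to run in between (it lives outside this file, with
// pPairScan[0] starting at zero). A host-side sketch of that scan:
#include <vector>
#include <cstddef>
static void prefixSumPairScan(std::vector<unsigned int>& pairScan)   // size = numBodies + 1
{
    for (std::size_t i = 1; i < pairScan.size(); i++)
    {
        pairScan[i] += pairScan[i - 1];   // pairScan[b] becomes the write offset for body b,
    }                                     // and the last element the total number of changes
}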
//----------------------------------------------------------------------------------------
BT3DGRID__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
{
    int index = BT3DGRID__mul24(BT3DGRIDblockIdx.x, BT3DGRIDblockDim.x) + BT3DGRIDthreadIdx.x;
    if(index >= (int)numBodies)
    {
        return;
    }
    btCuda3F1U bbMin = pAABB[index * 2];
    uint handleIndex = bbMin.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint* pOut = pPairOut + pPairScan[index];
    uint* pOut2 = pInp;
    uint num = 0;
    for(uint k = 0; k < curr; k++, pInp++)
    {
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            *pOut = *pInp;
            pOut++;
        }
        if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
        {
            *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
            pOut2++;
            num++;
        }
    }
    pPairBuffStartCurr[handleIndex] = BT3DGRIDmake_uint2(start, num);
} // squeezeOverlappingPairBuffD()
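//----------------------------------------------------------------------------------------
// Two outputs per slot: entries without BT_CUDA_PAIR_FOUND_FLG (pairs that are new this frame or
// that stopped overlapping) are copied, flags intact, to pPairOut at offset pPairScan[index] so the
// caller can update the pair cache; entries still carrying a status flag are kept in place with the
// flags cleared, and 'num' becomes the slot's pair count for the next frame.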
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// E N D O F K E R N E L F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
extern "C"
{
//Round a / b to nearest higher integer value
int BT3DGRIDPREF(iDivUp)(int a, int b)
{
return (a % b != 0) ? (a / b + 1) : (a / b);
}
// compute grid and thread block size for a given number of elements
void BT3DGRIDPREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
{
numThreads = BT3DGRIDmin(blockSize, n);
numBlocks = BT3DGRIDPREF(iDivUp)(n, numThreads);
}
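// For example, n = 1000 elements with blockSize = 256 give
// numThreads = min(256, 1000) = 256 and numBlocks = iDivUp(1000, 256) = 4.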
void BT3DGRIDPREF(calcHashAABB)(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
    int numThreads, numBlocks;
    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
    // execute the kernel
    BT3DGRIDEXECKERNEL(numBlocks, numThreads, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
    // check if kernel invocation generated an error
    CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
}
void BT3DGRIDPREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
{
    int numThreads, numBlocks;
    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
    MY_CUDA_SAFE_CALL(BT3DGRIDMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
    BT3DGRIDEXECKERNEL(numBlocks, numThreads, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
    CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
}
void BT3DGRIDPREF(findOverlappingPairs(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies))
{
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
#endif
    int numThreads, numBlocks;
    BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
    BT3DGRIDEXECKERNEL(numBlocks, numThreads, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
    CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
}
void BT3DGRIDPREF(findPairsLarge(btCuda3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
{
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
#endif
    int numThreads, numBlocks;
    BT3DGRIDPREF(computeGridSize)(numBodies, 64, numBlocks, numThreads);
    BT3DGRIDEXECKERNEL(numBlocks, numThreads, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
    CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
}
void BT3DGRIDPREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies))
{
    int numThreads, numBlocks;
    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
    BT3DGRIDEXECKERNEL(numBlocks, numThreads, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
    CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
}
void BT3DGRIDPREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies))
{
    int numThreads, numBlocks;
    BT3DGRIDPREF(computeGridSize)(numBodies, 256, numBlocks, numThreads);
    BT3DGRIDEXECKERNEL(numBlocks, numThreads, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint*)pPairOut,pAABB,numBodies));
    CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
} // btCuda_squeezeOverlappingPairBuff()
} // extern "C"