CPU implementation of btCudaBroadphase added.

It is called bt3DGridBroadphase and btCudaBroadphase is now derived from it rater than from btSimpleBroadphase Test of bt3DGridBroadphase was added to CDTestFramework
2008-11-25 03:16:11 +00:00
parent 7dda192bfc
commit 09aa2dbbe7
10 changed files with 1247 additions and 839 deletions
--- a/Extras/CUDA/btCudaBroadphase.cu
+++ b/Extras/CUDA/btCudaBroadphase.cu
@@ -65,9 +65,9 @@
 __device__ inline btCuda3F1U tex_fetch3F1U(float4 a) { return *((btCuda3F1U*)(&a)); }

 #if B_CUDA_USE_TEX
-	#define FETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
+	#define BT3DGRIDFETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
 #else
-	#define FETCH(t, i) t[i]
+	#define BT3DGRIDFETCH(t, i) t[i]
 #endif

 texture<uint2, 1, cudaReadModeElementType> particleHashTex;
@@ -80,323 +80,25 @@ __constant__ btCudaBroadphaseParams params;

 //----------------------------------------------------------------------------------------

-// calculate position in uniform grid
-__device__ int3 btCuda_calcGridPos(float4 p)
-{
-    int3 gridPos;
-    gridPos.x = floor((p.x - params.m_worldOriginX) / params.m_cellSizeX);
-    gridPos.y = floor((p.y - params.m_worldOriginY) / params.m_cellSizeY);
-    gridPos.z = floor((p.z - params.m_worldOriginZ) / params.m_cellSizeZ);
-    return gridPos;
-}
+#define BT3DGRID__device__ __device__
+#define BT3DGRIDmax(a, b) max(a, b)
+#define BT3DGRIDmin(a, b) min(a, b)
+#define BT3DGRIDparams params
+#define BT3DGRID__mul24(a, b) __mul24(a, b)
+#define BT3DGRID__global__ __global__
+#define BT3DGRID__shared__ __shared__
+#define BT3DGRID__syncthreads() __syncthreads()
+#define BT3DGRIDmake_uint2(x, y) make_uint2(x, y)
+#define BT3DGRIDmake_int3(x, y, z) make_int3(x, y, z)
+#define BT3DGRIDPREF(func) btCuda_##func
+#define BT3DGPRDMemset cudaMemset
+#define BT3DGRIDblockIdx blockIdx
+#define BT3DGRIDblockDim blockDim
+#define BT3DGRIDthreadIdx threadIdx
+#define BT3DGRIDEXECKERNEL(numb, numt, kfunc, args) kfunc<<<numb, numt>>>args

 //----------------------------------------------------------------------------------------

-// calculate address in grid from position (clamping to edges)
-__device__ uint btCuda_calcGridHash(int3 gridPos)
-{
-    gridPos.x = max(0, min(gridPos.x, params.m_gridSizeX - 1));
-    gridPos.y = max(0, min(gridPos.y, params.m_gridSizeY - 1));
-    gridPos.z = max(0, min(gridPos.z, params.m_gridSizeZ - 1));
-    return __mul24(__mul24(gridPos.z, params.m_gridSizeY), params.m_gridSizeX) + __mul24(gridPos.y, params.m_gridSizeX) + gridPos.x;
-}
-
-//----------------------------------------------------------------------------------------
-
-// calculate grid hash value for each body using its AABB
-__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
-{
-    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-	btCuda3F1U bbMin = pAABB[index*2];
-	btCuda3F1U bbMax = pAABB[index*2 + 1];
-	float4 pos;
-	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
-	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
-	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
-    // get address in grid
-    int3 gridPos = btCuda_calcGridPos(pos);
-    uint gridHash = btCuda_calcGridHash(gridPos);
-    // store grid hash and body index
-    pHash[index] = make_uint2(gridHash, index);
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	// Load hash data into shared memory so that we can look 
-	// at neighboring body's hash value without loading
-	// two hash values per thread
-	__shared__ uint sharedHash[257];
-	sharedHash[threadIdx.x+1] = sortedData.x;
-	if((index > 0) && (threadIdx.x == 0))
-	{
-		// first thread in block must load neighbor body hash
-		volatile uint2 prevData = pHash[index-1];
-		sharedHash[0] = prevData.x;
-	}
-	__syncthreads();
-	if((index == 0) || (sortedData.x != sharedHash[threadIdx.x]))
-	{
-		cellStart[sortedData.x] = index;
-	}
-}
-
-//----------------------------------------------------------------------------------------
-
-__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
-{
-	return	(min0.fx <= max1.fx)&& (min1.fx <= max0.fx) && 
-			(min0.fy <= max1.fy)&& (min1.fy <= max0.fy) && 
-			(min0.fz <= max1.fz)&& (min1.fz <= max0.fz); 
-}
-
-//----------------------------------------------------------------------------------------
-
-__device__ void findPairsInCell(int3	gridPos,
-								uint    index,
-								uint2*  pHash,
-								uint*   pCellStart,
-								btCuda3F1U* pAABB, 
-								uint*   pPairBuff,
-								uint2*	pPairBuffStartCurr,
-								uint	numBodies)
-{
-    if (	(gridPos.x < 0) || (gridPos.x > params.m_gridSizeX - 1)
-		||	(gridPos.y < 0) || (gridPos.y > params.m_gridSizeY - 1)
-		||  (gridPos.z < 0) || (gridPos.z > params.m_gridSizeZ - 1)) 
-    {
-		return;
-	}
-    uint gridHash = btCuda_calcGridHash(gridPos);
-    // get start of bucket for this cell
-    uint bucketStart = pCellStart[gridHash];
-    if (bucketStart == 0xffffffff)
-	{
-        return;   // cell empty
-	}
-	// iterate over bodies in this cell
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-    btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2); 
-	btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
-	uint handleIndex =  min0.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
-	uint curr_max = start_curr_next.x - start - 1;
-	uint bucketEnd = bucketStart + params.m_maxBodiesPerCell;
-	bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
-	for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
-	{
-        uint2 cellData = pHash[index2];
-        if (cellData.x != gridHash)
-        {
-			break;   // no longer in same bucket
-		}
-		uint unsorted_indx2 = cellData.y;
-        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
-        {   
-			btCuda3F1U min1 = FETCH(pAABB, unsorted_indx2*2);
-			btCuda3F1U max1 = FETCH(pAABB, unsorted_indx2*2 + 1);
-			if(cudaTestAABBOverlap(min0, max0, min1, max1))
-			{
-				uint handleIndex2 = min1.uw;
-				uint k;
-				for(k = 0; k < curr; k++)
-				{
-					uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
-					if(old_pair == handleIndex2)
-					{
-						pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
-						break;
-					}
-				}
-				if(k == curr)
-				{
-					pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
-					if(curr >= curr_max) 
-					{ // not a good solution, but let's avoid crash
-						break;
-					}
-					curr++;
-				}
-			}
-		}
-	}
-	pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
-    return;
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void
-findOverlappingPairsD(	btCuda3F1U*	pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
-						uint2* pPairBuffStartCurr, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-	btCuda3F1U bbMin = FETCH(pAABB, unsorted_indx*2);
-	btCuda3F1U bbMax = FETCH(pAABB, unsorted_indx*2 + 1);
-	float4 pos;
-	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
-	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
-	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
-    // get address in grid
-    int3 gridPos = btCuda_calcGridPos(pos);
-    // examine only neighbouring cells
-    for(int z=-1; z<=1; z++) {
-        for(int y=-1; y<=1; y++) {
-            for(int x=-1; x<=1; x++) {
-                findPairsInCell(gridPos + make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
-            }
-        }
-    }
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void
-findPairsLargeD(	btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
-						uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-    uint2 sortedData = pHash[index];
-	uint unsorted_indx = sortedData.y;
-	btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
-	btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
-	uint handleIndex =  min0.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
-	uint curr_max = start_curr_next.x - start - 1;
-    for(uint i = 0; i < numLarge; i++)
-    {
-		uint indx2 = numBodies + i;
-		btCuda3F1U min1 = FETCH(pAABB, indx2*2);
-		btCuda3F1U max1 = FETCH(pAABB, indx2*2 + 1);
-		if(cudaTestAABBOverlap(min0, max0, min1, max1))
-		{
-			uint k;
-			uint handleIndex2 =  min1.uw;
-			for(k = 0; k < curr; k++)
-			{
-				uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
-				if(old_pair == handleIndex2)
-				{
-					pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
-					break;
-				}
-			}
-			if(k == curr)
-			{
-				pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
-				if(curr >= curr_max) 
-				{ // not a good solution, but let's avoid crash
-					break;
-				}
-				curr++;
-			}
-		}
-    }
-	pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
-    return;
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-	btCuda3F1U bbMin = pAABB[index * 2];
-	uint handleIndex = bbMin.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint *pInp = pPairBuff + start;
-	uint num_changes = 0;
-	for(uint k = 0; k < curr; k++, pInp++)
-	{
-		if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
-		{
-			num_changes++;
-		}
-	}
-	pPairScan[index+1] = num_changes;
-}
-
-//----------------------------------------------------------------------------------------
-
-__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
-{
-    int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
-    if(index >= numBodies)
-	{
-		return;
-	}
-	btCuda3F1U bbMin = pAABB[index * 2];
-	uint handleIndex = bbMin.uw;
-	uint2 start_curr = pPairBuffStartCurr[handleIndex];
-	uint start = start_curr.x;
-	uint curr = start_curr.y;
-	uint* pInp = pPairBuff + start;
-	uint* pOut = pPairOut + pPairScan[index];
-	uint* pOut2 = pInp;
-	uint num = 0; 
-	for(uint k = 0; k < curr; k++, pInp++)
-	{
-		if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
-		{
-			*pOut = *pInp;
-			pOut++;
-		}
-		if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
-		{
-			*pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
-			pOut2++;
-			num++;
-		}
-	}
-	pPairBuffStartCurr[handleIndex] = make_uint2(start, num);
-} // squeezeOverlappingPairBuffD()
-
-
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//               E N D   O F    K E R N E L    F U N C T I O N S 
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-//----------------------------------------------------------------------------------------
-
-
 //! Check for CUDA error
 #  define CUT_CHECK_ERROR(errorMessage) do {                                 \
    cudaError_t err = cudaGetLastError();                                    \
@@ -430,9 +132,7 @@ __global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffSta
        btCuda_exit(EXIT_FAILURE);                                           \
    } } while (0)

-
-extern "C"
-{
+//----------------------------------------------------------------------------------------

 void btCuda_exit(int val)
 {
@@ -465,125 +165,9 @@ void btCuda_setParameters(btCudaBroadphaseParams* hostParams)
    MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(btCudaBroadphaseParams)));
 }

-//Round a / b to nearest higher integer value
-int btCuda_iDivUp(int a, int b)
-{
-    return (a % b != 0) ? (a / b + 1) : (a / b);
-}
+//----------------------------------------------------------------------------------------

-// compute grid and thread block size for a given number of elements
-void btCuda_computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
-{
-    numThreads = min(blockSize, n);
-    numBlocks = btCuda_iDivUp(n, numThreads);
-}
+#include "bt3DGridBroadphaseFunc.h"

-void btCuda_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-    // execute the kernel
-    calcHashAABBD<<< numBlocks, numThreads >>>(pAABB, (uint2*)hash, numBodies);
-    // check if kernel invocation generated an error
-    CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
-}
+//----------------------------------------------------------------------------------------

-void btCuda_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-	MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
-    findCellStartD<<< numBlocks, numThreads >>>((uint2*)hash, (uint*)cellStart, numBodies);
-    CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
-}
-
-void btCuda_findOverlappingPairs(	btCuda3F1U*		pAABB, unsigned int* pHash,
-									unsigned int*	pCellStart,
-									unsigned int*	pPairBuff,
-									unsigned int*	pPairBuffStartCurr,
-									unsigned int	numBodies)
-{
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
-#endif
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
-    findOverlappingPairsD<<< numBlocks, numThreads >>>(
-		pAABB,
-		(uint2*)pHash,
-        (uint*)pCellStart,
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-		numBodies
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
-#endif
- } // btCuda_findOverlappingPairs()
-
-
-
-void btCuda_findPairsLarge(	btCuda3F1U*	pAABB, unsigned int* pHash,
-							unsigned int*	pCellStart,
-							unsigned int*	pPairBuff,
-							unsigned int*	pPairBuffStartCurr,
-							unsigned int	numBodies,
-							unsigned int	numLarge)
-{
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
-#endif
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
-    findPairsLargeD<<< numBlocks, numThreads >>>(
-		pAABB,
-		(uint2*)pHash,
-        (uint*)pCellStart,
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-		numBodies,
-		numLarge
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
-#if B_CUDA_USE_TEX
-    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
-#endif
- } // btCuda_findPairsLarge()
-
-
-
-void btCuda_computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, 
-									unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-    computePairCacheChangesD<<< numBlocks, numThreads >>>(
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-        (uint*)pPairScan,
-        pAABB, 
-        numBodies
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
- } // btCuda_computePairCacheChanges()
-
-
-void btCuda_squeezeOverlappingPairBuff(	unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, 
-										unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies)
-{
-    int numThreads, numBlocks;
-    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
-    squeezeOverlappingPairBuffD<<< numBlocks, numThreads >>>(
-		(uint*)pPairBuff,
-		(uint2*)pPairBuffStartCurr,
-        (uint*)pPairScan,
-        (uint*)pPairOut,
-        pAABB, 
-        numBodies
-	);
-    CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
-} // btCuda_squeezeOverlappingPairBuff()
-
-
-}   // extern "C"