more CUDA work

rponom
2008-10-28 23:25:59 +00:00
parent f5e16847df
commit 309a12836d
12 changed files with 1325 additions and 357 deletions


@@ -199,6 +199,35 @@ reorderDataAndFindCellStartD(uint2* particleHash, // particle id sorted by hash
}
__global__ void
findCellStartD(uint2* particleHash, // particle id sorted by hash
               uint* cellStart)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint2 sortedData = particleHash[index];
    // Load hash data into shared memory so that we can look
    // at a neighboring particle's hash value without loading
    // two hash values per thread
    __shared__ uint sharedHash[257]; // blockDim.x + 1 entries; assumes 256-thread blocks
    sharedHash[threadIdx.x + 1] = sortedData.x;
    if (index > 0 && threadIdx.x == 0)
    {
        // first thread in block must load neighbor particle hash
        volatile uint2 prevData = particleHash[index - 1];
        sharedHash[0] = prevData.x;
    }
    __syncthreads();
    // this particle starts a new cell if its hash differs from its predecessor's
    if (index == 0 || sortedData.x != sharedHash[threadIdx.x])
    {
        cellStart[sortedData.x] = index;
    }
}
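For context, a host-side launch for findCellStartD might look like the sketch below. The 256-thread block size is an assumption tied to the sharedHash[257] array above (blockDim.x + 1 entries), and launchFindCellStart is a hypothetical helper name, not part of this commit; cellStart is assumed to be reset to 0xffffffff (the empty-cell marker used later in this file) before each launch, and numParticles is assumed to be a multiple of the block size since the kernel does no bounds check.
// Hypothetical host-side launcher (illustrative, not part of this commit).
void launchFindCellStart(uint2* dParticleHash, uint* dCellStart, uint numParticles)
{
    uint numThreads = 256;                      // must stay in sync with sharedHash[257]
    uint numBlocks = numParticles / numThreads; // one thread per particle, no remainder
    findCellStartD<<< numBlocks, numThreads >>>(dParticleHash, dCellStart);
}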
// collide two spheres using the DEM method
__device__ float3 collideSpheres(float4 posA, float4 posB,
                                 float4 velA, float4 velB,
@@ -378,4 +407,217 @@ collideD(float4* newPos, float4* newVel,
#endif
}
__device__
uint cudaTestAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
{
    // two AABBs overlap iff their extents overlap on all three axes
    return (min0.x <= max1.x) && (min1.x <= max0.x) &&
           (min0.y <= max1.y) && (min1.y <= max0.y) &&
           (min0.z <= max1.z) && (min1.z <= max0.z);
}
#if 0
__device__
void findPairsInCell(int3 gridPos,
                     uint index,
                     uint2* pParticleHash,
                     uint* pCellStart,
                     float4* pAABB,
                     uint* pPairBuff,
                     uint2* pPairBuffStartCurr)
{
    if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x - 1) ||
        (gridPos.y < 0) || (gridPos.y > params.gridSize.y - 1) ||
        (gridPos.z < 0) || (gridPos.z > params.gridSize.z - 1)) {
        return;
    }
    uint gridHash = calcGridHash(gridPos);
    // get start of bucket for this cell
    uint bucketStart = pCellStart[gridHash];
    if (bucketStart == 0xffffffff)
        return; // cell empty
    // iterate over particles in this cell
    float4 min0 = pAABB[index*2];
    float4 max0 = pAABB[index*2 + 1];
    uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    // uint* pBuf = pPairBuff; // + curr;
    for(uint q = 0; q < params.maxParticlesPerCell; q++)
    {
        uint index2 = bucketStart + q;
        uint2 cellData = pParticleHash[index2];
        if (cellData.x != gridHash) break; // no longer in same bucket
        if (index2 != index) // check not colliding with self
        {
            float4 min1 = pAABB[index2*2];
            float4 max1 = pAABB[index2*2 + 1];
            if(cudaTestAABBOverlap(min0, max0, min1, max1))
            {
                curr++;
            }
            //*pBuf = index2;
            //pPairBuff[curr] = index2;
            //pPairBuff[0] = index2;
            //pBuf++;
        }
    }
    // uint del = curr - start;
    if(curr != start)
    {
        curr = start;
    }
    pPairBuffStartCurr[index] = make_uint2(start, curr);
    //
    return;
}
#endif
__device__
void findPairsInCell(int3 gridPos,
                     uint index,
                     uint2* pParticleHash,
                     uint* pCellStart,
                     float4* pAABB,
                     uint* pPairBuff,
                     uint2* pPairBuffStartCurr,
                     uint numParticles)
{
    if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x - 1) ||
        (gridPos.y < 0) || (gridPos.y > params.gridSize.y - 1) ||
        (gridPos.z < 0) || (gridPos.z > params.gridSize.z - 1)) {
        return;
    }
    uint gridHash = calcGridHash(gridPos);
    // get start of bucket for this cell
    uint bucketStart = pCellStart[gridHash];
    if (bucketStart == 0xffffffff)
        return; // cell empty
    // iterate over particles in this cell
    float4 min0 = pAABB[index*2];
    float4 max0 = pAABB[index*2 + 1];
    uint2 sortedData = pParticleHash[index];
    uint unsorted_indx = sortedData.y; // pair buffer is indexed by the original (unsorted) particle id
    uint2 start_curr = pPairBuffStartCurr[unsorted_indx];
    // uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint bucketEnd = bucketStart + params.maxParticlesPerCell;
    bucketEnd = (bucketEnd > numParticles) ? numParticles : bucketEnd;
    for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
    {
        uint2 cellData = pParticleHash[index2];
        if (cellData.x != gridHash) break; // no longer in same bucket
        if (index2 < index) // skip self and count each pair only once
        {
            float4 min1 = pAABB[index2*2];
            float4 max1 = pAABB[index2*2 + 1];
            if(cudaTestAABBOverlap(min0, max0, min1, max1))
            {
                uint k;
                uint unsorted_indx2 = cellData.y;
                // if the pair is already cached, just mark it as found again
                for(k = 0; k < curr; k++)
                {
                    uint old_pair = pPairBuff[start + k] & (~BT_CUDA_PAIR_ANY_FLG);
                    if(old_pair == unsorted_indx2)
                    {
                        pPairBuff[start + k] |= BT_CUDA_PAIR_FOUND_FLG;
                        break;
                    }
                }
                // otherwise append it as a new pair
                if(k == curr)
                {
                    pPairBuff[start + curr] = unsorted_indx2 | BT_CUDA_PAIR_NEW_FLG;
                    curr++;
                }
            }
        }
    }
    pPairBuffStartCurr[unsorted_indx] = make_uint2(start, curr);
    // pPairBuffStartCurr[index] = make_uint2(start, curr);
    return;
}
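The pair bookkeeping above packs status flags into the high bits of each pPairBuff entry. The exact values are defined in the broadphase headers rather than in this hunk; a plausible layout, assuming pair indices fit in the low bits, is:
// Illustrative flag layout (assumed, not shown in this diff).
// FOUND marks a cached pair seen again this frame; NEW marks a pair added this frame.
#define BT_CUDA_PAIR_FOUND_FLG (0x40000000)
#define BT_CUDA_PAIR_NEW_FLG   (0x20000000)
#define BT_CUDA_PAIR_ANY_FLG   (BT_CUDA_PAIR_FOUND_FLG | BT_CUDA_PAIR_NEW_FLG)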
__global__ void
btCudaFindOverlappingPairsD(float4* pAABB,
                            uint2* pParticleHash,
                            uint* pCellStart,
                            uint* pPairBuff,
                            uint2* pPairBuffStartCurr,
                            uint numParticles)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    float4 bbMin = pAABB[index*2];
    float4 bbMax = pAABB[index*2 + 1];
    float4 pos = (bbMin + bbMax) * 0.5f;
    // get address in grid
    int3 gridPos = calcGridPos(pos);
    // examine only neighbouring cells
    for(int z = -1; z <= 1; z++) {
        for(int y = -1; y <= 1; y++) {
            for(int x = -1; x <= 1; x++) {
                findPairsInCell(gridPos + make_int3(x, y, z), index, pParticleHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numParticles);
            }
        }
    }
}
__global__ void
btCudaComputePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint num_changes = 0;
    // entries without the FOUND flag are either new this frame or no longer
    // overlapping; both count as cache changes
    for(uint k = 0; k < curr; k++, pInp++)
    {
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            num_changes++;
        }
    }
    // write at index+1 so a prefix sum over pPairScan yields per-particle output offsets
    pPairScan[index + 1] = num_changes;
}
__global__ void
btCudaSqueezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint* pOut = pPairOut + pPairScan[index];
    uint* pOut2 = pInp;
    uint num = 0;
    for(uint k = 0; k < curr; k++, pInp++)
    {
        // copy changed pairs (new or removed) to the compacted output buffer
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            *pOut = *pInp;
            pOut++;
        }
        // keep surviving pairs (new or re-found) in the cache, with flags cleared
        if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
        {
            *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
            pOut2++;
            num++;
        }
    }
    pPairBuffStartCurr[index] = make_uint2(start, num);
} // btCudaSqueezeOverlappingPairBuffD()
#endif
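Taken together, the three kernels form the per-frame pair-cache update. A minimal host-side sketch of the sequence, assuming 256-thread blocks, numParticles padded to a block multiple, pPairScan[0] preset to zero, and an external inclusive prefix-sum routine (prefixSumInclusive is a placeholder name, not an API from this codebase):
// Hypothetical per-frame driver (illustrative, not part of this commit).
void btCudaUpdatePairCache(float4* dAABB, uint2* dParticleHash, uint* dCellStart,
                           uint* dPairBuff, uint2* dPairBuffStartCurr,
                           uint* dPairScan, uint* dPairOut, uint numParticles)
{
    uint numThreads = 256;
    uint numBlocks = numParticles / numThreads;
    // 1. grow/refresh each particle's pair cache from the sorted grid
    btCudaFindOverlappingPairsD<<< numBlocks, numThreads >>>(
        dAABB, dParticleHash, dCellStart, dPairBuff, dPairBuffStartCurr, numParticles);
    // 2. count cache changes; counts land at pPairScan[index+1]
    btCudaComputePairCacheChangesD<<< numBlocks, numThreads >>>(
        dPairBuff, dPairBuffStartCurr, dPairScan);
    // 3. inclusive prefix sum over numParticles+1 entries: with pPairScan[0] == 0,
    //    pPairScan[index] becomes the output offset for particle index and
    //    pPairScan[numParticles] the total number of changed pairs
    prefixSumInclusive(dPairScan, numParticles + 1);
    // 4. compact changed pairs into dPairOut and strip flags from the cache
    btCudaSqueezeOverlappingPairBuffD<<< numBlocks, numThreads >>>(
        dPairBuff, dPairBuffStartCurr, dPairScan, dPairOut);
}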