more CUDA work

rponom
2008-10-28 23:25:59 +00:00
parent f5e16847df
commit 309a12836d
12 changed files with 1325 additions and 357 deletions


@@ -199,6 +199,35 @@ reorderDataAndFindCellStartD(uint2* particleHash, // particle id sorted by hash
}
__global__ void
findCellStartD(uint2* particleHash, // particle id sorted by hash
               uint* cellStart)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint2 sortedData = particleHash[index];
    // Load hash data into shared memory so that we can look
    // at a neighboring particle's hash value without loading
    // two hash values per thread
    __shared__ uint sharedHash[257]; // blockDim.x + 1 entries; assumes 256-thread blocks
    sharedHash[threadIdx.x + 1] = sortedData.x;
    if (index > 0 && threadIdx.x == 0)
    {
        // first thread in block must load neighbor particle hash
        volatile uint2 prevData = particleHash[index - 1];
        sharedHash[0] = prevData.x;
    }
    __syncthreads();
    // this particle starts a new cell if its hash differs from its predecessor's
    if (index == 0 || sortedData.x != sharedHash[threadIdx.x])
    {
        cellStart[sortedData.x] = index;
    }
}
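For context, a host-side launch for findCellStartD might look like the sketch below. The 256-thread block size is an assumption tied to the sharedHash[257] array above (blockDim.x + 1 entries), and launchFindCellStart is a hypothetical helper name, not part of this commit; cellStart is assumed to be reset to 0xffffffff (the empty-cell marker used later in this file) before each launch, and numParticles is assumed to be a multiple of the block size since the kernel does no bounds check.
// Hypothetical host-side launcher (illustrative, not part of this commit).
void launchFindCellStart(uint2* dParticleHash, uint* dCellStart, uint numParticles)
{
    uint numThreads = 256;                      // must stay in sync with sharedHash[257]
    uint numBlocks = numParticles / numThreads; // one thread per particle, no remainder
    findCellStartD<<< numBlocks, numThreads >>>(dParticleHash, dCellStart);
}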
// collide two spheres using the DEM method
__device__ float3 collideSpheres(float4 posA, float4 posB,
                                 float4 velA, float4 velB,
@@ -378,4 +407,217 @@ collideD(float4* newPos, float4* newVel,
#endif
}
__device__
uint cudaTestAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
{
    // two AABBs overlap iff their extents overlap on all three axes
    return (min0.x <= max1.x) && (min1.x <= max0.x) &&
           (min0.y <= max1.y) && (min1.y <= max0.y) &&
           (min0.z <= max1.z) && (min1.z <= max0.z);
}
#if 0
__device__
void findPairsInCell(int3 gridPos,
                     uint index,
                     uint2* pParticleHash,
                     uint* pCellStart,
                     float4* pAABB,
                     uint* pPairBuff,
                     uint2* pPairBuffStartCurr)
{
    if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x - 1) ||
        (gridPos.y < 0) || (gridPos.y > params.gridSize.y - 1) ||
        (gridPos.z < 0) || (gridPos.z > params.gridSize.z - 1)) {
        return;
    }
    uint gridHash = calcGridHash(gridPos);
    // get start of bucket for this cell
    uint bucketStart = pCellStart[gridHash];
    if (bucketStart == 0xffffffff)
        return; // cell empty
    // iterate over particles in this cell
    float4 min0 = pAABB[index*2];
    float4 max0 = pAABB[index*2 + 1];
    uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    // uint* pBuf = pPairBuff; // + curr;
    for(uint q = 0; q < params.maxParticlesPerCell; q++)
    {
        uint index2 = bucketStart + q;
        uint2 cellData = pParticleHash[index2];
        if (cellData.x != gridHash) break; // no longer in same bucket
        if (index2 != index) // check not colliding with self
        {
            float4 min1 = pAABB[index2*2];
            float4 max1 = pAABB[index2*2 + 1];
            if(cudaTestAABBOverlap(min0, max0, min1, max1))
            {
                curr++;
            }
            //*pBuf = index2;
            //pPairBuff[curr] = index2;
            //pPairBuff[0] = index2;
            //pBuf++;
        }
    }
    // uint del = curr - start;
    if(curr != start)
    {
        curr = start;
    }
    pPairBuffStartCurr[index] = make_uint2(start, curr);
    //
    return;
}
#endif
__device__
void findPairsInCell(int3 gridPos,
                     uint index,
                     uint2* pParticleHash,
                     uint* pCellStart,
                     float4* pAABB,
                     uint* pPairBuff,
                     uint2* pPairBuffStartCurr,
                     uint numParticles)
{
    if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x - 1) ||
        (gridPos.y < 0) || (gridPos.y > params.gridSize.y - 1) ||
        (gridPos.z < 0) || (gridPos.z > params.gridSize.z - 1)) {
        return;
    }
    uint gridHash = calcGridHash(gridPos);
    // get start of bucket for this cell
    uint bucketStart = pCellStart[gridHash];
    if (bucketStart == 0xffffffff)
        return; // cell empty
    // iterate over particles in this cell
    float4 min0 = pAABB[index*2];
    float4 max0 = pAABB[index*2 + 1];
    uint2 sortedData = pParticleHash[index];
    uint unsorted_indx = sortedData.y; // pair buffer is indexed by the original (unsorted) particle id
    uint2 start_curr = pPairBuffStartCurr[unsorted_indx];
    // uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint bucketEnd = bucketStart + params.maxParticlesPerCell;
    bucketEnd = (bucketEnd > numParticles) ? numParticles : bucketEnd;
    for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
    {
        uint2 cellData = pParticleHash[index2];
        if (cellData.x != gridHash) break; // no longer in same bucket
        if (index2 < index) // skip self and count each pair only once
        {
            float4 min1 = pAABB[index2*2];
            float4 max1 = pAABB[index2*2 + 1];
            if(cudaTestAABBOverlap(min0, max0, min1, max1))
            {
                uint k;
                uint unsorted_indx2 = cellData.y;
                // if the pair is already cached, just mark it as found again
                for(k = 0; k < curr; k++)
                {
                    uint old_pair = pPairBuff[start + k] & (~BT_CUDA_PAIR_ANY_FLG);
                    if(old_pair == unsorted_indx2)
                    {
                        pPairBuff[start + k] |= BT_CUDA_PAIR_FOUND_FLG;
                        break;
                    }
                }
                // otherwise append it as a new pair
                if(k == curr)
                {
                    pPairBuff[start + curr] = unsorted_indx2 | BT_CUDA_PAIR_NEW_FLG;
                    curr++;
                }
            }
        }
    }
    pPairBuffStartCurr[unsorted_indx] = make_uint2(start, curr);
    // pPairBuffStartCurr[index] = make_uint2(start, curr);
    return;
}
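The pair bookkeeping above packs status flags into the high bits of each pPairBuff entry. The exact values are defined in the broadphase headers rather than in this hunk; a plausible layout, assuming pair indices fit in the low bits, is:
// Illustrative flag layout (assumed, not shown in this diff).
// FOUND marks a cached pair seen again this frame; NEW marks a pair added this frame.
#define BT_CUDA_PAIR_FOUND_FLG (0x40000000)
#define BT_CUDA_PAIR_NEW_FLG   (0x20000000)
#define BT_CUDA_PAIR_ANY_FLG   (BT_CUDA_PAIR_FOUND_FLG | BT_CUDA_PAIR_NEW_FLG)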
__global__ void
btCudaFindOverlappingPairsD(float4* pAABB,
                            uint2* pParticleHash,
                            uint* pCellStart,
                            uint* pPairBuff,
                            uint2* pPairBuffStartCurr,
                            uint numParticles)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    float4 bbMin = pAABB[index*2];
    float4 bbMax = pAABB[index*2 + 1];
    float4 pos = (bbMin + bbMax) * 0.5f;
    // get address in grid
    int3 gridPos = calcGridPos(pos);
    // examine only neighbouring cells
    for(int z = -1; z <= 1; z++) {
        for(int y = -1; y <= 1; y++) {
            for(int x = -1; x <= 1; x++) {
                findPairsInCell(gridPos + make_int3(x, y, z), index, pParticleHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numParticles);
            }
        }
    }
}
__global__ void
btCudaComputePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint num_changes = 0;
    // entries without the FOUND flag are either new this frame or no longer
    // overlapping; both count as cache changes
    for(uint k = 0; k < curr; k++, pInp++)
    {
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            num_changes++;
        }
    }
    // write at index+1 so a prefix sum over pPairScan yields per-particle output offsets
    pPairScan[index + 1] = num_changes;
}
__global__ void
btCudaSqueezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    uint2 start_curr = pPairBuffStartCurr[index];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint* pOut = pPairOut + pPairScan[index];
    uint* pOut2 = pInp;
    uint num = 0;
    for(uint k = 0; k < curr; k++, pInp++)
    {
        // copy changed pairs (new or removed) to the compacted output buffer
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            *pOut = *pInp;
            pOut++;
        }
        // keep surviving pairs (new or re-found) in the cache, with flags cleared
        if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
        {
            *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
            pOut2++;
            num++;
        }
    }
    pPairBuffStartCurr[index] = make_uint2(start, num);
} // btCudaSqueezeOverlappingPairBuffD()
#endif
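Taken together, the three kernels form the per-frame pair-cache update. A minimal host-side sketch of the sequence, assuming 256-thread blocks, numParticles padded to a block multiple, pPairScan[0] preset to zero, and an external inclusive prefix-sum routine (prefixSumInclusive is a placeholder name, not an API from this codebase):
// Hypothetical per-frame driver (illustrative, not part of this commit).
void btCudaUpdatePairCache(float4* dAABB, uint2* dParticleHash, uint* dCellStart,
                           uint* dPairBuff, uint2* dPairBuffStartCurr,
                           uint* dPairScan, uint* dPairOut, uint numParticles)
{
    uint numThreads = 256;
    uint numBlocks = numParticles / numThreads;
    // 1. grow/refresh each particle's pair cache from the sorted grid
    btCudaFindOverlappingPairsD<<< numBlocks, numThreads >>>(
        dAABB, dParticleHash, dCellStart, dPairBuff, dPairBuffStartCurr, numParticles);
    // 2. count cache changes; counts land at pPairScan[index+1]
    btCudaComputePairCacheChangesD<<< numBlocks, numThreads >>>(
        dPairBuff, dPairBuffStartCurr, dPairScan);
    // 3. inclusive prefix sum over numParticles+1 entries: with pPairScan[0] == 0,
    //    pPairScan[index] becomes the output offset for particle index and
    //    pPairScan[numParticles] the total number of changed pairs
    prefixSumInclusive(dPairScan, numParticles + 1);
    // 4. compact changed pairs into dPairOut and strip flags from the cache
    btCudaSqueezeOverlappingPairBuffD<<< numBlocks, numThreads >>>(
        dPairBuff, dPairBuffStartCurr, dPairScan, dPairOut);
}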