/*
 * Copyright 1993-2006 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO USER:
 *
 * This source code is subject to NVIDIA ownership rights under U.S. and
 * international Copyright laws.
 *
 * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
 * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 * OR PERFORMANCE OF THIS SOURCE CODE.
 *
 * U.S. Government End Users. This source code is a "commercial item" as
 * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
 * "commercial computer software" and "commercial computer software
 * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 * and is provided to the U.S. Government only as a commercial end item.
 * Consistent with 48 C.F.R. 12.212 and 48 C.F.R. 227.7202-1 through
 * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 * source code with only those rights set forth herein.
 */

#include <cstdlib>
#include <cstdio>
#include <string.h>

#include "cutil_math.h"
#include "math_constants.h"

#if defined(__APPLE__) || defined(MACOSX)
#include <GLUT/glut.h>
#else
#include <GL/glut.h>
#endif

#include <cuda_gl_interop.h>

#include "btCudaBroadphaseKernel.h"
//#include "radixsort.cu"

//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// K E R N E L   F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------

#ifdef __DEVICE_EMULATION__
#define B_CUDA_USE_TEX 0
#else
#define B_CUDA_USE_TEX 1
#endif

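// When B_CUDA_USE_TEX is enabled, the FETCH macro below routes AABB reads through the 1D
// texture pAABBTex (bound and unbound around the kernel launches in the host wrappers at
// the bottom of this file), presumably to get cached reads for the scattered AABB accesses;
// device-emulation builds fall back to plain array indexing.
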
__device__ inline btCuda3F1U tex_fetch3F1U(float4 a) { return *((btCuda3F1U*)(&a)); }

#if B_CUDA_USE_TEX
#define FETCH(t, i) tex_fetch3F1U(tex1Dfetch(t##Tex, i))
#else
#define FETCH(t, i) t[i]
#endif

texture<uint2, 1, cudaReadModeElementType> particleHashTex;
texture<uint, 1, cudaReadModeElementType> cellStartTex;
texture<float4, 1, cudaReadModeElementType> pAABBTex;

//----------------------------------------------------------------------------------------

__constant__ btCudaBroadphaseParams params;

//----------------------------------------------------------------------------------------

// calculate position in uniform grid
__device__ int3 btCuda_calcGridPos(float4 p)
{
    int3 gridPos;
    gridPos.x = floor((p.x - params.m_worldOriginX) / params.m_cellSizeX);
    gridPos.y = floor((p.y - params.m_worldOriginY) / params.m_cellSizeY);
    gridPos.z = floor((p.z - params.m_worldOriginZ) / params.m_cellSizeZ);
    return gridPos;
}

//----------------------------------------------------------------------------------------

// calculate the cell hash (linear cell index) from integer grid coordinates, clamping to the grid edges
__device__ uint btCuda_calcGridHash(int3 gridPos)
{
    gridPos.x = max(0, min(gridPos.x, params.m_gridSizeX - 1));
    gridPos.y = max(0, min(gridPos.y, params.m_gridSizeY - 1));
    gridPos.z = max(0, min(gridPos.z, params.m_gridSizeZ - 1));
    return __mul24(__mul24(gridPos.z, params.m_gridSizeY), params.m_gridSizeX) + __mul24(gridPos.y, params.m_gridSizeX) + gridPos.x;
}

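// Worked example (illustrative 64 x 64 x 64 grid, not a value taken from params):
// cell (x, y, z) = (3, 2, 1) hashes to 1*64*64 + 2*64 + 3 = 4227, i.e. cells are laid out
// x-fastest, then y, then z.
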
//----------------------------------------------------------------------------------------

// calculate grid hash value for each body using its AABB
__global__ void calcHashAABBD(btCuda3F1U* pAABB, uint2* pHash, uint numBodies)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    if(index >= numBodies)
    {
        return;
    }
    btCuda3F1U bbMin = pAABB[index*2];
    btCuda3F1U bbMax = pAABB[index*2 + 1];
    float4 pos;
    pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
    pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
    pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
    // get address in grid
    int3 gridPos = btCuda_calcGridPos(pos);
    uint gridHash = btCuda_calcGridHash(gridPos);
    // store grid hash and body index
    pHash[index] = make_uint2(gridHash, index);
}

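// Note: the (gridHash, bodyIndex) entries written above are expected to be sorted by
// gridHash before findCellStartD and findOverlappingPairsD run (they read pHash as
// "sortedData"). The sort itself is not in this file -- see the commented-out radixsort.cu
// include at the top; presumably the caller sorts between btCuda_calcHashAABB and
// btCuda_findCellStart.
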
//----------------------------------------------------------------------------------------

__global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    if(index >= numBodies)
    {
        return;
    }
    uint2 sortedData = pHash[index];
    // Load hash data into shared memory so that we can look
    // at a neighboring body's hash value without loading
    // two hash values per thread
    __shared__ uint sharedHash[257];
    sharedHash[threadIdx.x+1] = sortedData.x;
    if((index > 0) && (threadIdx.x == 0))
    {
        // first thread in block must load neighbor body hash
        volatile uint2 prevData = pHash[index-1];
        sharedHash[0] = prevData.x;
    }
    __syncthreads();
    if((index == 0) || (sortedData.x != sharedHash[threadIdx.x]))
    {
        cellStart[sortedData.x] = index;
    }
}

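// sharedHash has blockDim.x + 1 = 257 entries (the host wrapper launches 256 threads per
// block): one per thread plus an extra slot so the first thread of a block can see the hash
// of the last body of the previous block. A thread writes cellStart[hash] only when its hash
// differs from its predecessor's, so cellStart ends up holding the index of the first sorted
// entry of each occupied cell; cells left at 0xffffffff (set by the cudaMemset in
// btCuda_findCellStart) are empty.
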
//----------------------------------------------------------------------------------------

__device__ uint cudaTestAABBOverlap(btCuda3F1U min0, btCuda3F1U max0, btCuda3F1U min1, btCuda3F1U max1)
{
    return (min0.fx <= max1.fx) && (min1.fx <= max0.fx) &&
           (min0.fy <= max1.fy) && (min1.fy <= max0.fy) &&
           (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
}

//----------------------------------------------------------------------------------------

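// Pair buffer bookkeeping used by findPairsInCell and findPairsLargeD:
// pPairBuffStartCurr[handle] = (start offset into pPairBuff, number of pairs currently
// cached for this handle); the space available to a handle ends one slot before the next
// handle's start offset (curr_max below). Each cached entry is the other handle's index
// OR'ed with status flags: BT_CUDA_PAIR_FOUND_FLG marks a previously cached pair that
// overlaps again this frame, BT_CUDA_PAIR_NEW_FLG marks a pair added this frame, and
// BT_CUDA_PAIR_ANY_FLG is used as the mask that strips the flags (so it appears to combine
// both). The flags are consumed and cleared later by computePairCacheChangesD and
// squeezeOverlappingPairBuffD.
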
__device__ void findPairsInCell(int3 gridPos,
                                uint index,
                                uint2* pHash,
                                uint* pCellStart,
                                btCuda3F1U* pAABB,
                                uint* pPairBuff,
                                uint2* pPairBuffStartCurr,
                                uint numBodies)
{
    if (   (gridPos.x < 0) || (gridPos.x > params.m_gridSizeX - 1)
        || (gridPos.y < 0) || (gridPos.y > params.m_gridSizeY - 1)
        || (gridPos.z < 0) || (gridPos.z > params.m_gridSizeZ - 1))
    {
        return;
    }
    uint gridHash = btCuda_calcGridHash(gridPos);
    // get start of bucket for this cell
    uint bucketStart = pCellStart[gridHash];
    if (bucketStart == 0xffffffff)
    {
        return; // cell empty
    }
    // iterate over bodies in this cell
    uint2 sortedData = pHash[index];
    uint unsorted_indx = sortedData.y;
    btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
    btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
    uint handleIndex = min0.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
    uint curr_max = start_curr_next.x - start - 1;
    uint bucketEnd = bucketStart + params.m_maxBodiesPerCell;
    bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
    for(uint index2 = bucketStart; index2 < bucketEnd; index2++)
    {
        uint2 cellData = pHash[index2];
        if (cellData.x != gridHash)
        {
            break; // no longer in same bucket
        }
        uint unsorted_indx2 = cellData.y;
        if (unsorted_indx2 < unsorted_indx) // avoid self-collision and duplicate pairs
        {
            btCuda3F1U min1 = FETCH(pAABB, unsorted_indx2*2);
            btCuda3F1U max1 = FETCH(pAABB, unsorted_indx2*2 + 1);
            if(cudaTestAABBOverlap(min0, max0, min1, max1))
            {
                uint handleIndex2 = min1.uw;
                uint k;
                for(k = 0; k < curr; k++)
                {
                    uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
                    if(old_pair == handleIndex2)
                    {
                        pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
                        break;
                    }
                }
                if(k == curr)
                {
                    pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
                    if(curr >= curr_max)
                    {   // pair buffer for this handle is full: stop adding (not a good solution, but avoids a crash)
                        break;
                    }
                    curr++;
                }
            }
        }
    }
    pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
    return;
}

//----------------------------------------------------------------------------------------

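// findOverlappingPairsD hashes each body's AABB center and then calls findPairsInCell for
// the 3 x 3 x 3 block of neighbouring cells around it. That sweep only finds every pair if
// an AABB does not reach beyond the directly adjacent cells -- presumably the reason larger
// bodies are kept out of the grid and handled by the brute-force findPairsLargeD kernel
// further below.
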
__global__ void
findOverlappingPairsD(btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
                      uint2* pPairBuffStartCurr, uint numBodies)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    if(index >= numBodies)
    {
        return;
    }
    uint2 sortedData = pHash[index];
    uint unsorted_indx = sortedData.y;
    btCuda3F1U bbMin = FETCH(pAABB, unsorted_indx*2);
    btCuda3F1U bbMax = FETCH(pAABB, unsorted_indx*2 + 1);
    float4 pos;
    pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
    pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
    pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
    // get address in grid
    int3 gridPos = btCuda_calcGridPos(pos);
    // examine only neighbouring cells
    for(int z = -1; z <= 1; z++) {
        for(int y = -1; y <= 1; y++) {
            for(int x = -1; x <= 1; x++) {
                findPairsInCell(gridPos + make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
            }
        }
    }
}

//----------------------------------------------------------------------------------------

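// findPairsLargeD tests every body against the "large" AABBs, which are stored at the tail
// of the AABB array (entries numBodies .. numBodies + numLarge - 1), reusing the same
// per-handle pair buffer as findPairsInCell.
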
__global__ void
findPairsLargeD(btCuda3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff,
                uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    if(index >= numBodies)
    {
        return;
    }
    uint2 sortedData = pHash[index];
    uint unsorted_indx = sortedData.y;
    btCuda3F1U min0 = FETCH(pAABB, unsorted_indx*2);
    btCuda3F1U max0 = FETCH(pAABB, unsorted_indx*2 + 1);
    uint handleIndex = min0.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
    uint curr_max = start_curr_next.x - start - 1;
    for(uint i = 0; i < numLarge; i++)
    {
        uint indx2 = numBodies + i;
        btCuda3F1U min1 = FETCH(pAABB, indx2*2);
        btCuda3F1U max1 = FETCH(pAABB, indx2*2 + 1);
        if(cudaTestAABBOverlap(min0, max0, min1, max1))
        {
            uint k;
            uint handleIndex2 = min1.uw;
            for(k = 0; k < curr; k++)
            {
                uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG);
                if(old_pair == handleIndex2)
                {
                    pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG;
                    break;
                }
            }
            if(k == curr)
            {
                pPairBuff[start+curr] = handleIndex2 | BT_CUDA_PAIR_NEW_FLG;
                if(curr >= curr_max)
                {   // pair buffer for this handle is full: stop adding (not a good solution, but avoids a crash)
                    break;
                }
                curr++;
            }
        }
    }
    pPairBuffStartCurr[handleIndex] = make_uint2(start, curr);
    return;
}

//----------------------------------------------------------------------------------------

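// computePairCacheChangesD counts, per body, the cached entries that did NOT get
// BT_CUDA_PAIR_FOUND_FLG this frame -- i.e. newly added pairs (still carrying
// BT_CUDA_PAIR_NEW_FLG) and pairs that stopped overlapping -- and writes the count to
// pPairScan[index + 1]. squeezeOverlappingPairBuffD below reads pPairScan[index] as its
// output offset, so presumably the caller runs a prefix sum over pPairScan between the two
// kernels to turn the counts into offsets.
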
__global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, btCuda3F1U* pAABB, uint numBodies)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    if(index >= numBodies)
    {
        return;
    }
    btCuda3F1U bbMin = pAABB[index * 2];
    uint handleIndex = bbMin.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint num_changes = 0;
    for(uint k = 0; k < curr; k++, pInp++)
    {
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            num_changes++;
        }
    }
    pPairScan[index+1] = num_changes;
}

//----------------------------------------------------------------------------------------

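// squeezeOverlappingPairBuffD performs two compactions in one pass over a body's cached
// pairs: entries without BT_CUDA_PAIR_FOUND_FLG (new pairs and pairs that stopped
// overlapping) are copied to pPairOut at offset pPairScan[index] for the caller to process,
// while entries carrying any flag (pairs that currently overlap) are compacted back into
// pPairBuff in place with their flags cleared, and the handle's pair count is updated to
// the compacted size.
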
__global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, btCuda3F1U* pAABB, uint numBodies)
{
    int index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    if(index >= numBodies)
    {
        return;
    }
    btCuda3F1U bbMin = pAABB[index * 2];
    uint handleIndex = bbMin.uw;
    uint2 start_curr = pPairBuffStartCurr[handleIndex];
    uint start = start_curr.x;
    uint curr = start_curr.y;
    uint* pInp = pPairBuff + start;
    uint* pOut = pPairOut + pPairScan[index];
    uint* pOut2 = pInp;
    uint num = 0;
    for(uint k = 0; k < curr; k++, pInp++)
    {
        if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG))
        {
            *pOut = *pInp;
            pOut++;
        }
        if((*pInp) & BT_CUDA_PAIR_ANY_FLG)
        {
            *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG);
            pOut2++;
            num++;
        }
    }
    pPairBuffStartCurr[handleIndex] = make_uint2(start, num);
} // squeezeOverlappingPairBuffD()

//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
// E N D   O F   K E R N E L   F U N C T I O N S
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------
//----------------------------------------------------------------------------------------

//! Check for CUDA error
#define CUT_CHECK_ERROR(errorMessage) do { \
    cudaError_t err = cudaGetLastError(); \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) ); \
        btCuda_exit(EXIT_FAILURE); \
    } \
    err = cudaThreadSynchronize(); \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \
                errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) ); \
        btCuda_exit(EXIT_FAILURE); \
    } } while (0)

#define MY_CUDA_SAFE_CALL_NO_SYNC( call) do { \
    cudaError err = call; \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) ); \
        btCuda_exit(EXIT_FAILURE); \
    } } while (0)

#define MY_CUDA_SAFE_CALL( call) do { \
    MY_CUDA_SAFE_CALL_NO_SYNC(call); \
    cudaError err = cudaThreadSynchronize(); \
    if( cudaSuccess != err) { \
        fprintf(stderr, "Cuda errorSync in file '%s' in line %i : %s.\n", \
                __FILE__, __LINE__, cudaGetErrorString( err) ); \
        btCuda_exit(EXIT_FAILURE); \
    } } while (0)

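// The macros above abort through btCuda_exit() on any CUDA error; the synchronizing
// variants also call cudaThreadSynchronize() (the pre-CUDA-4.0 API used throughout this
// file) so that asynchronous kernel failures are reported at the offending call site.
//
// Sketch of the presumed per-frame calling sequence for the host wrappers below; the actual
// driver code lives outside this file and the variable names here are illustrative only:
//
//   btCuda_setParameters(&hostParams);                  // grid origin / cell size / grid size
//   btCuda_calcHashAABB(pAABB, pHash, numBodies);       // (cellHash, bodyIndex) per body
//   // sort pHash by cell hash, e.g. with the radix sort referenced at the top of this file
//   btCuda_findCellStart(pHash, pCellStart, numBodies, numCells);
//   btCuda_findOverlappingPairs(pAABB, pHash, pCellStart, pPairBuff, pPairBuffStartCurr, numBodies);
//   btCuda_findPairsLarge(pAABB, pHash, pCellStart, pPairBuff, pPairBuffStartCurr, numBodies, numLarge);
//   btCuda_computePairCacheChanges(pPairBuff, pPairBuffStartCurr, pPairScan, pAABB, numBodies);
//   // prefix-sum pPairScan to turn per-body change counts into output offsets
//   btCuda_squeezeOverlappingPairBuff(pPairBuff, pPairBuffStartCurr, pPairScan, pPairOut, pAABB, numBodies);
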
extern "C"
|
|
{
|
|
|
|
void btCuda_exit(int val)
{
    exit(val);
}

void btCuda_allocateArray(void** devPtr, unsigned int size)
{
    MY_CUDA_SAFE_CALL(cudaMalloc(devPtr, size));
}

void btCuda_freeArray(void* devPtr)
{
    MY_CUDA_SAFE_CALL(cudaFree(devPtr));
}

void btCuda_copyArrayFromDevice(void* host, const void* device, unsigned int size)
{
    MY_CUDA_SAFE_CALL(cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost));
}

void btCuda_copyArrayToDevice(void* device, const void* host, unsigned int size)
{
    MY_CUDA_SAFE_CALL(cudaMemcpy((char*)device, host, size, cudaMemcpyHostToDevice));
}

void btCuda_setParameters(btCudaBroadphaseParams* hostParams)
{
    // copy parameters to constant memory
    MY_CUDA_SAFE_CALL(cudaMemcpyToSymbol(params, hostParams, sizeof(btCudaBroadphaseParams)));
}

// Round a / b to nearest higher integer value
int btCuda_iDivUp(int a, int b)
{
    return (a % b != 0) ? (a / b + 1) : (a / b);
}

// compute grid and thread block size for a given number of elements
void btCuda_computeGridSize(int n, int blockSize, int &numBlocks, int &numThreads)
{
    numThreads = min(blockSize, n);
    numBlocks = btCuda_iDivUp(n, numThreads);
}

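// Example: n = 1000 bodies with blockSize = 256 gives numThreads = 256 and
// numBlocks = btCuda_iDivUp(1000, 256) = 4, i.e. 1024 thread slots; the
// "if(index >= numBodies) return;" guard in each kernel discards the 24 excess threads.
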
void btCuda_calcHashAABB(btCuda3F1U* pAABB, unsigned int* hash, unsigned int numBodies)
{
    int numThreads, numBlocks;
    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
    // execute the kernel
    calcHashAABBD<<< numBlocks, numThreads >>>(pAABB, (uint2*)hash, numBodies);
    // check if kernel invocation generated an error
    CUT_CHECK_ERROR("calcHashAABBD kernel execution failed");
}

void btCuda_findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
{
    int numThreads, numBlocks;
    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
    MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint)));
    findCellStartD<<< numBlocks, numThreads >>>((uint2*)hash, (uint*)cellStart, numBodies);
    CUT_CHECK_ERROR("Kernel execution failed: findCellStartD");
}

void btCuda_findOverlappingPairs(btCuda3F1U* pAABB, unsigned int* pHash,
                                 unsigned int* pCellStart,
                                 unsigned int* pPairBuff,
                                 unsigned int* pPairBuffStartCurr,
                                 unsigned int numBodies)
{
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(btCuda3F1U)));
#endif
    int numThreads, numBlocks;
    btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
    findOverlappingPairsD<<< numBlocks, numThreads >>>(
        pAABB,
        (uint2*)pHash,
        (uint*)pCellStart,
        (uint*)pPairBuff,
        (uint2*)pPairBuffStartCurr,
        numBodies
    );
    CUT_CHECK_ERROR("Kernel execution failed: findOverlappingPairsD");
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // btCuda_findOverlappingPairs()

void btCuda_findPairsLarge(btCuda3F1U* pAABB, unsigned int* pHash,
                           unsigned int* pCellStart,
                           unsigned int* pPairBuff,
                           unsigned int* pPairBuffStartCurr,
                           unsigned int numBodies,
                           unsigned int numLarge)
{
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(btCuda3F1U)));
#endif
    int numThreads, numBlocks;
    btCuda_computeGridSize(numBodies, 64, numBlocks, numThreads);
    findPairsLargeD<<< numBlocks, numThreads >>>(
        pAABB,
        (uint2*)pHash,
        (uint*)pCellStart,
        (uint*)pPairBuff,
        (uint2*)pPairBuffStartCurr,
        numBodies,
        numLarge
    );
    CUT_CHECK_ERROR("Kernel execution failed: findPairsLargeD");
#if B_CUDA_USE_TEX
    MY_CUDA_SAFE_CALL(cudaUnbindTexture(pAABBTex));
#endif
} // btCuda_findPairsLarge()

void btCuda_computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr,
                                    unsigned int* pPairScan, btCuda3F1U* pAABB, unsigned int numBodies)
{
    int numThreads, numBlocks;
    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
    computePairCacheChangesD<<< numBlocks, numThreads >>>(
        (uint*)pPairBuff,
        (uint2*)pPairBuffStartCurr,
        (uint*)pPairScan,
        pAABB,
        numBodies
    );
    CUT_CHECK_ERROR("Kernel execution failed: computePairCacheChangesD");
} // btCuda_computePairCacheChanges()

void btCuda_squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan,
                                       unsigned int* pPairOut, btCuda3F1U* pAABB, unsigned int numBodies)
{
    int numThreads, numBlocks;
    btCuda_computeGridSize(numBodies, 256, numBlocks, numThreads);
    squeezeOverlappingPairBuffD<<< numBlocks, numThreads >>>(
        (uint*)pPairBuff,
        (uint2*)pPairBuffStartCurr,
        (uint*)pPairScan,
        (uint*)pPairOut,
        pAABB,
        numBodies
    );
    CUT_CHECK_ERROR("Kernel execution failed: squeezeOverlappingPairBuffD");
} // btCuda_squeezeOverlappingPairBuff()

} // extern "C"