diff --git a/Extras/CUDA/btCudaBroadphase.cpp b/Extras/CUDA/btCudaBroadphase.cpp index 7b06d305a..25a715083 100644 --- a/Extras/CUDA/btCudaBroadphase.cpp +++ b/Extras/CUDA/btCudaBroadphase.cpp @@ -34,17 +34,21 @@ subject to the following restrictions: #include #endif - +#define MAX_COLL_PAIR_PER_PARTICLE 64 #define USE_SORT 1 +#define USE_OLD 0 +#define USE_CUDA 1 #include "btCudaBroadphase.h" #include "LinearMath/btAlignedAllocator.h" +#include "LinearMath/btQuickprof.h" #include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h" btCudaBroadphase::btCudaBroadphase(SimParams& simParams,int maxProxies) : btSimpleBroadphase(maxProxies, - new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache), +// new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache), + new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache), m_bInitialized(false), m_numParticles(simParams.numBodies), m_hPos(0), @@ -155,6 +159,26 @@ void btCudaBroadphase::_initialize(int numParticles) m_hCellStart = new uint[m_simParams.numCells]; memset(m_hCellStart, 0, m_simParams.numCells*sizeof(uint)); + + m_hPairBuffStartCurr = new unsigned int[m_numParticles * 2 + 1]; + // --------------- for now, init with MAX_COLL_PAIR_PER_PARTICLE for each particle + m_hPairBuffStartCurr[0] = 0; + m_hPairBuffStartCurr[1] = 0; + for(uint i = 1; i <= m_numParticles; i++) + { + m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + MAX_COLL_PAIR_PER_PARTICLE; +// m_hPairBuffStartCurr[i * 2 + 1] = m_hPairBuffStartCurr[i * 2]; + m_hPairBuffStartCurr[i * 2 + 1] = 0; + } + //---------------- + m_hAABB = new float[numParticles*4*2]; // BB Min & Max + + m_hPairBuff = new unsigned int[m_numParticles * MAX_COLL_PAIR_PER_PARTICLE]; + memset(m_hPairBuff, 0x00, m_numParticles*MAX_COLL_PAIR_PER_PARTICLE*4); + + m_hPairScan = new unsigned int[m_numParticles + 1]; + m_hPairOut = new unsigned int[m_numParticles * MAX_COLL_PAIR_PER_PARTICLE]; + // allocate GPU data unsigned int memSize = sizeof(float) * 4 * m_numParticles; @@ -176,7 +200,18 @@ void btCudaBroadphase::_initialize(int numParticles) allocateArray((void**)&m_dGridCells, m_numGridCells*m_maxParticlesPerCell*sizeof(uint)); #endif - m_colorVBO = createVBO(m_numParticles*4*sizeof(float)); + allocateArray((void**)&m_dPairBuff, m_numParticles*MAX_COLL_PAIR_PER_PARTICLE*sizeof(unsigned int)); + copyArrayToDevice(m_dPairBuff, m_hPairBuff, 0, sizeof(unsigned int)*m_numParticles*MAX_COLL_PAIR_PER_PARTICLE); + + allocateArray((void**)&m_dPairBuffStartCurr, (m_numParticles*2 + 1)*sizeof(unsigned int)); + allocateArray((void**)&m_dAABB, memSize*2); + + copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, 0, sizeof(unsigned int)*(m_numParticles*2 + 1)); + + allocateArray((void**)&m_dPairScan, (m_numParticles + 1)*sizeof(unsigned int)); + allocateArray((void**)&m_dPairOut, m_numParticles*MAX_COLL_PAIR_PER_PARTICLE*sizeof(unsigned int)); + + m_colorVBO = createVBO(m_numParticles*4*sizeof(float)); #if 1 // fill color buffer @@ -201,6 +236,10 @@ void btCudaBroadphase::_initialize(int numParticles) setParameters(&m_simParams); +// Pair cache data + m_maxPairsPerParticle = 0; + m_numOverflows = 0; + m_bInitialized = true; } @@ -217,6 +256,14 @@ void btCudaBroadphase::_finalize() delete [] m_hGridCounters; delete [] m_hGridCells; + delete [] m_dPairBuff; + delete [] m_dPairBuffStartCurr; + delete [] m_hAABB; + + delete [] m_hPairBuff; + delete [] m_hPairScan; + delete [] m_hPairOut; + freeArray(m_dVel[0]); freeArray(m_dVel[1]); @@ -231,12 +278,20 @@ void btCudaBroadphase::_finalize() freeArray(m_dGridCounters); freeArray(m_dGridCells); #endif + freeArray(m_dPairBuff); + freeArray(m_dPairBuffStartCurr); + freeArray(m_dAABB); + + freeArray(m_hPairBuff); + freeArray(m_hPairScan); + freeArray(m_hPairOut); unregisterGLBufferObject(m_posVbo[0]); unregisterGLBufferObject(m_posVbo[1]); glDeleteBuffers(2, (const GLuint*)m_posVbo); glDeleteBuffers(1, (const GLuint*)&m_colorVBO); + } btCudaBroadphase::~btCudaBroadphase() @@ -300,9 +355,6 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) static int frameCount = 0; //printf("framecount=%d\n",frameCount++); - - int numRejected=0; - if (m_numHandles >= 0) { @@ -373,9 +425,10 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) // sort and search method // calculate hash - calcHash(m_posVbo[m_currentPosRead], - m_dParticleHash[0], - m_numParticles); + { + BT_PROFILE("calcHash-- CUDA"); + calcHash( m_posVbo[m_currentPosRead], m_dParticleHash[0], m_numParticles); + } #if DEBUG_GRID copyArrayFromDevice((void *) m_hParticleHash, (void *) m_dParticleHash[0], 0, sizeof(uint)*2*m_numParticles); @@ -388,7 +441,10 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) // sort particles based on hash - RadixSort((KeyValuePair *) m_dParticleHash[0], (KeyValuePair *) m_dParticleHash[1], m_numParticles, 32); + { + BT_PROFILE("RadixSort-- CUDA"); + RadixSort((KeyValuePair *) m_dParticleHash[0], (KeyValuePair *) m_dParticleHash[1], m_numParticles, 32); + } #if DEBUG_GRID copyArrayFromDevice((void *) m_hParticleHash, (void *) m_dParticleHash[0], 0, sizeof(uint)*2*m_numParticles); @@ -401,14 +457,24 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) // reorder particle arrays into sorted order and // find start of each cell - reorderDataAndFindCellStart(m_dParticleHash[0], - m_posVbo[m_currentPosRead], - m_dVel[m_currentVelRead], - m_dSortedPos, - m_dSortedVel, - m_dCellStart, - m_numParticles, - m_simParams.numCells); + { + BT_PROFILE("Reorder-- CUDA"); +#if USE_OLD + reorderDataAndFindCellStart(m_dParticleHash[0], + m_posVbo[m_currentPosRead], + m_dVel[m_currentVelRead], + m_dSortedPos, + m_dSortedVel, + m_dCellStart, + m_numParticles, + m_simParams.numCells); +#else + findCellStart(m_dParticleHash[0], + m_dCellStart, + m_numParticles, + m_simParams.numCells); +#endif + } //#define DEBUG_GRID2 #ifdef DEBUG_GRID2 @@ -455,9 +521,10 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) } */ - copyArrayFromDevice((void *) m_hParticleHash, (void *) m_dParticleHash[0], 0, sizeof(uint)*2*m_numParticles); - copyArrayFromDevice((void *) m_hCellStart, (void *) m_dCellStart, 0, sizeof(uint)*m_simParams.numCells); - copyArrayFromDevice((void *) m_hSortedPos, (void*) m_dSortedPos,0 , sizeof(float)*4*m_numParticles); + copyArrayFromDevice((void *) m_hParticleHash, (void *) m_dParticleHash[0], 0, sizeof(uint)*2*m_numParticles); + copyArrayFromDevice((void *) m_hCellStart, (void *) m_dCellStart, 0, sizeof(uint)*m_simParams.numCells); + +// copyArrayFromDevice((void *) m_hSortedPos, (void*) m_dSortedPos,0 , sizeof(float)*4*m_numParticles); //#define DEBUG_INDICES 1 #ifdef DEBUG_INDICES @@ -482,6 +549,7 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) // } } +#if USE_OLD //printf("particle hash sorted:\n"); for(uint pi=0; pim_min+proxy0->m_max)*0.5f; + btVector3 mypos = (proxy0->m_aabbMin + proxy0->m_aabbMax)*0.5f; // float4* p = (float4*)&m_hSortedPos[index*4]; @@ -504,6 +572,7 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) particleGridPos.y = floor((mypos.y() - m_simParams.worldOrigin.y) / m_simParams.cellSize.y); particleGridPos.z = floor((mypos.z() - m_simParams.worldOrigin.z) / m_simParams.cellSize.z); + int numRejected=0; //for(int z=0; z<1; z++) for(int z=-1; z<=1; z++) @@ -575,15 +644,19 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) } } - - +#else // USE_OLD + btBroadphasePairArray& overlappingPairArrayA = m_pairCache->getOverlappingPairArray(); + findOverlappingPairs(dispatcher); +#endif #endif //_USE_BRUTEFORCE_N +#if USE_OLD ///if this broadphase is used in a btMultiSapBroadphase, we shouldn't sort the overlapping paircache if (m_ownsPairCache && m_pairCache->hasDeferredRemoval()) { - + BT_PROFILE("Cleaning-- CPU"); + btBroadphasePairArray& overlappingPairArray = m_pairCache->getOverlappingPairArray(); //perform a sort, to find duplicates and to sort 'invalid' pairs to the end @@ -622,6 +695,7 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) needsRemoval = false;//callback->processOverlap(pair); } else { + bool hasOverlapA = testAabbOverlap(pair.m_pProxy0,pair.m_pProxy1); needsRemoval = true; } } else @@ -629,7 +703,7 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) //remove duplicate needsRemoval = true; //should have no algorithm - btAssert(!pair.m_algorithm); +// btAssert(!pair.m_algorithm); } if (needsRemoval) @@ -661,6 +735,7 @@ void btCudaBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher) #endif//CLEAN_INVALID_PAIRS } +#endif // USE_OLD } //printf("numRejected=%d\n",numRejected); @@ -1137,3 +1212,333 @@ void btCudaBroadphase::quickHack2() } + + + +void btCudaBroadphase::findOverlappingPairs(btDispatcher* dispatcher) +{ + BT_PROFILE("findOverlappingPairs -- CPU"); + int numRejected=0; + m_numPairsAdded = 0; + + { + BT_PROFILE("copy AABB -- CPU"); + + // do it faster ? + float* pVec = m_hAABB; + for(uint pi=0; pim_aabbMin.getX(); + *pVec++ = proxy0->m_aabbMin.getY(); + *pVec++ = proxy0->m_aabbMin.getZ(); + *pVec++ = 0.0F; + *pVec++ = proxy0->m_aabbMax.getX(); + *pVec++ = proxy0->m_aabbMax.getY(); + *pVec++ = proxy0->m_aabbMax.getZ(); + *pVec++ = 0.0F; + } + } + +#if USE_CUDA +{ + + { + BT_PROFILE("CopyBB to CUDA"); + copyArrayToDevice(m_dAABB, m_hAABB, 0, sizeof(float)*4*2*m_numParticles); + } + { + BT_PROFILE("btCudaFindOverlappingPairs"); + btCudaFindOverlappingPairs( m_dAABB, + m_dParticleHash[0], + m_dCellStart, + m_dPairBuff, + m_dPairBuffStartCurr, + m_numParticles + ); + } + { + BT_PROFILE("btCudaComputePairCacheChanges"); + btCudaComputePairCacheChanges(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_numParticles); + } + { + BT_PROFILE("scanOverlappingPairBuffCPU"); + copyArrayFromDevice(m_hPairScan, m_dPairScan, 0, sizeof(unsigned int)*(m_numParticles + 1)); + scanOverlappingPairBuffCPU(); + copyArrayToDevice(m_dPairScan, m_hPairScan, 0, sizeof(unsigned int)*(m_numParticles + 1)); + } + { + BT_PROFILE("btCudaSqueezeOverlappingPairBuff"); + btCudaSqueezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScan, m_dPairOut, m_numParticles); + } + { + BT_PROFILE("btCudaSqueezeOverlappingPairBuff"); + copyArrayFromDevice(m_hPairOut, m_dPairOut, 0, sizeof(unsigned int) * m_hPairScan[m_numParticles]); + } + +} +#else + findOverlappingPairsCPU( m_hAABB, + m_hParticleHash, + m_hCellStart, + m_hPairBuff, + m_hPairBuffStartCurr, + m_numParticles); + computePairCacheChangesCPU(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_numParticles); + scanOverlappingPairBuffCPU(); + squeezeOverlappingPairBuffCPU(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScan, m_hPairOut, m_numParticles); +#endif + { + BT_PROFILE("addPairsToCache"); + addPairsToCacheCPU(dispatcher); + } +} // btCudaBroadphase::fillOverlappingPairCache() + + + +// calculate position in uniform grid +int3 btCudaBroadphase::calcGridPosCPU(float4 p) +{ + int3 gridPos; + gridPos.x = floor((p.x - m_simParams.worldOrigin.x) / m_simParams.cellSize.x); + gridPos.y = floor((p.y - m_simParams.worldOrigin.y) / m_simParams.cellSize.y); + gridPos.z = floor((p.z - m_simParams.worldOrigin.z) / m_simParams.cellSize.z); + return gridPos; +} // btCudaBroadphase::calcGridPos() + +// calculate address in grid from position (clamping to edges) +uint btCudaBroadphase::calcGridHashCPU(int3 gridPos) +{ + gridPos.x = max(0, min(gridPos.x, m_simParams.gridSize.x-1)); + gridPos.y = max(0, min(gridPos.y, m_simParams.gridSize.y-1)); + gridPos.z = max(0, min(gridPos.z, m_simParams.gridSize.z-1)); + return (gridPos.z * m_simParams.gridSize.y) * m_simParams.gridSize.x + gridPos.y * m_simParams.gridSize.x + gridPos.x; +} + +void btCudaBroadphase::computePairCacheChangesCPU(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint numParticles) +{ + for(uint i = 0; i < numParticles; i++) + { + computePairCacheChangesCPU_D(i, pPairBuff, (uint2*)pPairBuffStartCurr, pPairScan); + } +} + +void btCudaBroadphase::computePairCacheChangesCPU_D(uint index, uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan) +{ + uint2 start_curr = pPairBuffStartCurr[index]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint *pInp = pPairBuff + start; + uint num_changes = 0; + for(uint k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) + { + num_changes++; + } + } + pPairScan[index+1] = num_changes; +} + +void btCudaBroadphase::findOverlappingPairsCPU( float* pAABB, + uint* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint* pPairBuffStartCurr, + uint numParticles) +{ + BT_PROFILE("findOverlappingPairsCPU -- CPU"); + for(uint i = 0; i < numParticles; i++) + { + findOverlappingPairsCPU_D( + i, + (float4 *)pAABB, + (uint2*)pParticleHash, + (uint*)pCellStart, + (uint*)pPairBuff, + (uint2*)pPairBuffStartCurr, + numParticles); + } +} // btCudaBroadphase::findOverlappingPairsCPU() + +void btCudaBroadphase::findOverlappingPairsCPU_D( uint index, + float4* pAABB, + uint2* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numParticles) +{ + float4 bbMin = pAABB[index*2]; + float4 bbMax = pAABB[index*2+1]; + float4 pos; + pos.x = (bbMin.x + bbMax.x) * 0.5f; + pos.y = (bbMin.y + bbMax.y) * 0.5f; + pos.z = (bbMin.z + bbMax.z) * 0.5f; + + // get address in grid + int3 gridPos = calcGridPosCPU(pos); + // examine only neighbouring cells + for(int z=-1; z<=1; z++) { + for(int y=-1; y<=1; y++) { + for(int x=-1; x<=1; x++) { + int3 gridPos2; + gridPos2.x = gridPos.x + x; + gridPos2.y = gridPos.y + y; + gridPos2.z = gridPos.z + z; + findPairsInCellCPU(gridPos2, index, pParticleHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numParticles); + } + } + } +} // btCudaBroadphase::findOverlappingPairsCPU_D() + + +void btCudaBroadphase::findPairsInCellCPU( int3 gridPos, + uint index, + uint2* pParticleHash, + uint* pCellStart, + float4* pAABB, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numParticles) +{ + if ((gridPos.x < 0) || (gridPos.x > m_simParams.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > m_simParams.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > m_simParams.gridSize.z-1)) { + return; + } + uint gridHash = calcGridHashCPU(gridPos); + // get start of bucket for this cell + uint bucketStart = pCellStart[gridHash]; + if (bucketStart == 0xffffffff) + return; // cell empty + // iterate over particles in this cell + float4 min0 = pAABB[index*2]; + float4 max0 = pAABB[index*2+1]; + + uint2 sortedData = pParticleHash[index]; + uint unsorted_indx = sortedData.y; + uint2 start_curr = pPairBuffStartCurr[unsorted_indx]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint curr1 = curr; + uint bucketEnd = bucketStart + m_simParams.maxParticlesPerCell; + bucketEnd = (bucketEnd > numParticles) ? numParticles : bucketEnd; + for(uint index2=bucketStart; index2 < bucketEnd; index2++) + { + uint2 cellData = pParticleHash[index2]; + if (cellData.x != gridHash) break; // no longer in same bucket + if (index2 != index) // check not colliding with self + { + float4 min1 = pAABB[index2*2]; + float4 max1 = pAABB[index2*2 + 1]; + if(cudaTestAABBOverlapCPU(min0, max0, min1, max1)) + { + uint k; + uint unsorted_indx2 = cellData.y; + for(k = 0; k < curr1; k++) + { + uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG); + if(old_pair == unsorted_indx2) + { + pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG; + break; + } + } + if(k == curr1) + { + pPairBuff[start+curr] = unsorted_indx2 | BT_CUDA_PAIR_NEW_FLG; + curr++; + } + } + } + } + pPairBuffStartCurr[unsorted_indx] = make_uint2(start, curr); + return; +} // btCudaBroadphase::findPairsInCellCPU() + +uint btCudaBroadphase::cudaTestAABBOverlapCPU(float4 min0, float4 max0, float4 min1, float4 max1) +{ + return (min0.x <= max1.x)&& (min1.x <= max0.x) && + (min0.y <= max1.y)&& (min1.y <= max0.y) && + (min0.z <= max1.z)&& (min1.z <= max0.z); +} // btCudaBroadphase::cudaTestAABBOverlapCPU() + + +void btCudaBroadphase::scanOverlappingPairBuffCPU() +{ + m_hPairScan[0] = 0; + for(uint i = 1; i <= m_numParticles; i++) + { + unsigned int delta = m_hPairScan[i]; + m_hPairScan[i] = m_hPairScan[i-1] + delta; + } +} // btCudaBroadphase::scanOverlappingPairBuffCPU() + +void btCudaBroadphase::squeezeOverlappingPairBuffCPU(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, uint numParticles) +{ + for(uint i = 0; i < numParticles; i++) + { + squeezeOverlappingPairBuffCPU_D(i, pPairBuff, (uint2*)pPairBuffStartCurr, pPairScan, pPairOut); + } +} // btCudaBroadphase::squeezeOverlappingPairBuffCPU() + +void btCudaBroadphase::squeezeOverlappingPairBuffCPU_D(uint index, uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut) +{ + uint2 start_curr = pPairBuffStartCurr[index]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint* pInp = pPairBuff + start; + uint* pOut = pPairOut + pPairScan[index]; + uint* pOut2 = pInp; + uint num = 0; + for(uint k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) + { + *pOut = *pInp; + pOut++; + } + if((*pInp) & BT_CUDA_PAIR_ANY_FLG) + { + *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG); + pOut2++; + num++; + } + } + pPairBuffStartCurr[index] = make_uint2(start, num); +} // btCudaBroadphase::squeezeOverlappingPairBuffCPU_D() + +unsigned int gNumPairsAdded = 0; + +void btCudaBroadphase::addPairsToCacheCPU(btDispatcher* dispatcher) +{ + gNumPairsAdded = 0; + for(uint i = 0; i < m_numParticles; i++) + { + unsigned int num = m_hPairScan[i+1] - m_hPairScan[i]; + if(!num) + { + continue; + } + unsigned int* pInp = m_hPairOut + m_hPairScan[i]; + unsigned int index0 = i; + btSimpleBroadphaseProxy* proxy0 = &m_pHandles[index0]; + for(uint j = 0; j < num; j++) + { + unsigned int indx1_s = pInp[j]; + unsigned int index1 = indx1_s & (~BT_CUDA_PAIR_ANY_FLG); + btSimpleBroadphaseProxy* proxy1 = &m_pHandles[index1]; + if(indx1_s & BT_CUDA_PAIR_NEW_FLG) + { + m_pairCache->addOverlappingPair(proxy0,proxy1); + gNumPairsAdded++; + } + else + { + m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher); + } + } + } +} // btCudaBroadphase::addPairsToCacheCPU() diff --git a/Extras/CUDA/btCudaBroadphase.h b/Extras/CUDA/btCudaBroadphase.h index 4c5ce80bc..f03a1ea25 100644 --- a/Extras/CUDA/btCudaBroadphase.h +++ b/Extras/CUDA/btCudaBroadphase.h @@ -37,7 +37,13 @@ class btCudaBroadphase : public btSimpleBroadphase unsigned int* m_hParticleHash; unsigned int* m_hCellStart; + + unsigned int* m_hPairBuffStartCurr; + float* m_hAABB; + unsigned int* m_hPairBuff; + unsigned int* m_hPairScan; + unsigned int* m_hPairOut; // GPU data float* m_dPos[2]; @@ -59,6 +65,14 @@ class btCudaBroadphase : public btSimpleBroadphase unsigned int m_currentPosRead, m_currentVelRead; unsigned int m_currentPosWrite, m_currentVelWrite; + // buffers on GPU + unsigned int* m_dPairBuff; + unsigned int* m_dPairBuffStartCurr; + float* m_dAABB; + + unsigned int* m_dPairScan; + unsigned int* m_dPairOut; + // params struct SimParams& m_simParams; @@ -66,6 +80,11 @@ class btCudaBroadphase : public btSimpleBroadphase unsigned int m_maxParticlesPerCell; +// debug + unsigned int m_numPairsAdded; + unsigned int m_maxPairsPerParticle; + unsigned int m_numOverflows; + protected: unsigned int createVBO(unsigned int size); @@ -120,5 +139,48 @@ public: void quickHack2(); void integrate(); + + void findOverlappingPairs(btDispatcher* dispatcher); + + int3 calcGridPosCPU(float4 p); + uint calcGridHashCPU(int3 gridPos); + + void computePairCacheChangesCPU(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint numParticles); + void computePairCacheChangesCPU_D(uint index, uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan); + + void findOverlappingPairsCPU( float* pAABB, + uint* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint* pPairBuffStartCurr, + uint numParticles); + void findOverlappingPairsCPU_D( uint index, + float4* pAABB, + uint2* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numParticles); + +void findPairsInCellCPU(int3 gridPos, + uint index, + uint2* pParticleHash, + uint* pCellStart, + float4* pAABB, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numParticles); +uint cudaTestAABBOverlapCPU(float4 min0, float4 max0, float4 min1, float4 max1); + + + void scanOverlappingPairBuffCPU(); + + void squeezeOverlappingPairBuffCPU(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, uint numParticles); + void squeezeOverlappingPairBuffCPU_D(uint index, uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut); + + void addPairsToCacheCPU(btDispatcher* dispatcher); + void resetOverlappingPairBuffCPU(); + + }; #endif //CUDA_BROADPHASE_H \ No newline at end of file diff --git a/Extras/CUDA/btCudaBroadphase.vcproj b/Extras/CUDA/btCudaBroadphase.vcproj index 9ecbdcbcd..4e18a89ca 100644 --- a/Extras/CUDA/btCudaBroadphase.vcproj +++ b/Extras/CUDA/btCudaBroadphase.vcproj @@ -44,7 +44,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - + + + + + + + + + - - - - - - - - - @@ -838,6 +808,16 @@ Outputs="$(ConfigurationName)\particleSystem_cu.obj" /> + + + @@ -848,6 +828,16 @@ Outputs="$(ConfigurationName)\particleSystem_cu.obj" /> + + + @@ -858,6 +848,16 @@ Outputs="$(ConfigurationName)\particleSystem_cu.obj" /> + + + @@ -899,6 +899,16 @@ Outputs="" /> + + + - - - + + @@ -1019,10 +1023,6 @@ RelativePath=".\paramgl.h" > - - diff --git a/Extras/CUDA/particleSystem.cpp b/Extras/CUDA/particleSystem.cpp index 8e6704cf6..d508b80a8 100644 --- a/Extras/CUDA/particleSystem.cpp +++ b/Extras/CUDA/particleSystem.cpp @@ -72,7 +72,7 @@ void ParticleSystem::initializeBullet() m_collisionConfiguration = new btDefaultCollisionConfiguration(); m_dispatcher = new btCollisionDispatcher(m_collisionConfiguration); - //m_broadphase = new btDbvtBroadphase(); +// m_broadphase = new btDbvtBroadphase(); //m_broadphase = new btAxisSweep3(btVector3(-3,-3,-3),btVector3(3,3,3)); m_broadphase = new btCudaBroadphase(m_params,m_params.numBodies+6); @@ -202,9 +202,9 @@ ParticleSystem::update(float deltaTime) hPosData[i*4+1] = trans.getOrigin().getY(); hPosData[i*4+2] = trans.getOrigin().getZ(); - m_hVel[i*4] = m_bulletParticles[i]->getLinearVelocity().getX()/10.; - m_hVel[i*4+1] = m_bulletParticles[i]->getLinearVelocity().getY()/10.; - m_hVel[i*4+2] = m_bulletParticles[i]->getLinearVelocity().getZ()/10.; + m_hVel[i*4] = m_bulletParticles[i]->getLinearVelocity().getX()/10.f; + m_hVel[i*4+1] = m_bulletParticles[i]->getLinearVelocity().getY()/10.f; + m_hVel[i*4+2] = m_bulletParticles[i]->getLinearVelocity().getZ()/10.f; } m_broadphase->copyBuffersFromHostToDevice(); diff --git a/Extras/CUDA/particleSystem.cu b/Extras/CUDA/particleSystem.cu index ce8117e86..319937572 100644 --- a/Extras/CUDA/particleSystem.cu +++ b/Extras/CUDA/particleSystem.cu @@ -49,13 +49,13 @@ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ + mm_exit(EXIT_FAILURE); \ } \ err = cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ - exit(EXIT_FAILURE); \ + mm_exit(EXIT_FAILURE); \ } } while (0) @@ -64,22 +64,27 @@ if( cudaSuccess != err) { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ + mm_exit(EXIT_FAILURE); \ } } while (0) # define MY_CUDA_SAFE_CALL( call) do { \ MY_CUDA_SAFE_CALL_NO_SYNC(call); \ cudaError err = cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ - fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + fprintf(stderr, "Cuda errorSync in file '%s' in line %i : %s.\n", \ __FILE__, __LINE__, cudaGetErrorString( err) ); \ - exit(EXIT_FAILURE); \ + mm_exit(EXIT_FAILURE); \ } } while (0) extern "C" { +void mm_exit(int val) +{ + exit(val); +} + void cudaInit(int argc, char **argv) { //CUT_DEVICE_INIT(argc, argv); @@ -112,6 +117,26 @@ void copyArrayFromDevice(void* host, const void* device, unsigned int vbo, int s void copyArrayToDevice(void* device, const void* host, int offset, int size) { MY_CUDA_SAFE_CALL(cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice)); +/* + cudaError_t err = cudaMemcpy((char *) device + offset, host, size, cudaMemcpyHostToDevice); + switch(err) + { + case cudaSuccess : + return; + case cudaErrorInvalidValue : + printf("\ncudaErrorInvalidValue : %d\n", err); + return; + case cudaErrorInvalidDevicePointer : + printf("\ncudaErrorInvalidDevicePointer : %d\n", err); + return; + case cudaErrorInvalidMemcpyDirection : + printf("\ncudaErrorInvalidMemcpyDirection : %d\n", err); + return; + default : + printf("\nX3 : %d\n", err); + return; + } +*/ } void registerGLBufferObject(uint vbo) @@ -255,6 +280,27 @@ reorderDataAndFindCellStart(uint* particleHash, MY_CUDA_SAFE_CALL(cudaGLUnmapBufferObject(vboOldPos)); } +#if 1 +void +findCellStart( uint* particleHash, + uint* cellStart, + uint numBodies, + uint numCells) +{ + int numThreads, numBlocks; + computeGridSize(numBodies, 256, numBlocks, numThreads); + + MY_CUDA_SAFE_CALL(cudaMemset(cellStart, 0xffffffff, numCells*sizeof(uint))); + + findCellStartD<<< numBlocks, numThreads >>>( + (uint2 *) particleHash, + (uint *) cellStart); + CUT_CHECK_ERROR("Kernel execution failed: findCellStartD"); + +} +#endif + + void collide(uint vboOldPos, uint vboNewPos, float* sortedPos, float* sortedVel, @@ -328,4 +374,67 @@ collide(uint vboOldPos, uint vboNewPos, #endif } +void +btCudaFindOverlappingPairs( float* pAABB, + uint* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint* pPairBuffStartCurr, + uint numParticles) +{ + +// cudaError err = cudaMemset(pPairBuff, 0x00, numParticles*32*4); +// if(err != cudaSuccess) +// { +// printf("\nAAAAA\n"); +// } + + int numThreads, numBlocks; +// computeGridSize(numParticles, 256, numBlocks, numThreads); + computeGridSize(numParticles, 64, numBlocks, numThreads); +// numThreads = 1; +// numBlocks = 1; + btCudaFindOverlappingPairsD<<< numBlocks, numThreads >>>( + (float4 *)pAABB, + (uint2*)pParticleHash, + (uint*)pCellStart, + (uint*)pPairBuff, + (uint2*)pPairBuffStartCurr, + numParticles + ); + CUT_CHECK_ERROR("Kernel execution failed: btCudaFindOverlappingPairsD"); + } // btCudaFindOverlappingPairs() + + +void +btCudaComputePairCacheChanges(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint numParticles) +{ + int numThreads, numBlocks; + computeGridSize(numParticles, 256, numBlocks, numThreads); + + btCudaComputePairCacheChangesD<<< numBlocks, numThreads >>>( + (uint*)pPairBuff, + (uint2*)pPairBuffStartCurr, + (uint*)pPairScan + ); + CUT_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD"); + } // btCudaFindOverlappingPairs() + + +void +btCudaSqueezeOverlappingPairBuff(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, uint numParticles) +{ + int numThreads, numBlocks; + computeGridSize(numParticles, 256, numBlocks, numThreads); + + btCudaSqueezeOverlappingPairBuffD<<< numBlocks, numThreads >>>( + (uint*)pPairBuff, + (uint2*)pPairBuffStartCurr, + (uint*)pPairScan, + pPairOut + ); + CUT_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD"); +} + + } // extern "C" diff --git a/Extras/CUDA/particleSystem.cuh b/Extras/CUDA/particleSystem.cuh index e2b297fc6..e58535f0f 100644 --- a/Extras/CUDA/particleSystem.cuh +++ b/Extras/CUDA/particleSystem.cuh @@ -42,6 +42,12 @@ reorderDataAndFindCellStart(uint* particleHash, uint numBodies, uint numCells); +void +findCellStart( uint* particleHash, + uint* cellStart, + uint numBodies, + uint numCells); + void collide(uint vboOldPos, uint vboNewPos, float* sortedPos, float* sortedVel, @@ -53,5 +59,20 @@ collide(uint vboOldPos, uint vboNewPos, uint numBodies, uint numCells, uint maxParticlesPerCell); + +void +btCudaFindOverlappingPairs( float* pAABB, + uint* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint* pPairBuffStartCurr, + uint numParticles); + +void +btCudaComputePairCacheChanges(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint numParticles); + + +void btCudaSqueezeOverlappingPairBuff(uint* pPairBuff, uint* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut, uint numParticles); + } diff --git a/Extras/CUDA/particleSystem.h b/Extras/CUDA/particleSystem.h index 3abf09078..0ac75e6b7 100644 --- a/Extras/CUDA/particleSystem.h +++ b/Extras/CUDA/particleSystem.h @@ -79,6 +79,7 @@ public: class btDefaultCollisionConfiguration* m_collisionConfiguration; class btCollisionDispatcher* m_dispatcher; class btCudaBroadphase* m_broadphase; +// class btBroadphaseInterface* m_broadphase; class btSequentialImpulseConstraintSolver* m_constraintSolver; btAlignedObjectArray m_bulletParticles; diff --git a/Extras/CUDA/particles.cpp b/Extras/CUDA/particles.cpp index cd5ad40fd..9ebb19efe 100644 --- a/Extras/CUDA/particles.cpp +++ b/Extras/CUDA/particles.cpp @@ -45,6 +45,9 @@ #include #endif + +#include "LinearMath/btQuickprof.h" + #include "particleSystem.h" #include "render_particles.h" #include "paramgl.h" @@ -165,7 +168,9 @@ void display() glColor3f(1.0, 1.0, 1.0); glutWireCube(2.0); - // collider + + + // collider glPushMatrix(); float4 p = psystem->getColliderPos(); glTranslatef(p.x, p.y, p.z); @@ -188,6 +193,19 @@ void display() } psystem->debugDraw(); + + glDisable(GL_DEPTH_TEST); +// glDisable(GL_LIGHTING); +// glColor3f(0, 0, 0); + float offsX = 10.f; + float offsY = 10.f; + renderer->showProfileInfo(offsX, offsY, 20.f); +// glEnable(GL_LIGHTING); + glEnable(GL_DEPTH_TEST); + + + + glutSwapBuffers(); @@ -357,6 +375,19 @@ inline float frand() // commented out to remove unused parameter warnings in Linux void key(unsigned char key, int /*x*/, int /*y*/) { + +#ifndef BT_NO_PROFILE + if (key >= 0x31 && key < 0x37) + { + int child = key-0x31; + renderer->m_profileIterator->Enter_Child(child); + } + if (key==0x30) + { + renderer->m_profileIterator->Enter_Parent(); + } +#endif //BT_NO_PROFILE + switch (key) { case ' ': @@ -394,13 +425,13 @@ void key(unsigned char key, int /*x*/, int /*y*/) displayEnabled = !displayEnabled; break; - case '1': + case 'g': psystem->reset(ParticleSystem::CONFIG_GRID); break; - case '2': + case 'a': psystem->reset(ParticleSystem::CONFIG_RANDOM); break; - case '3': + case 'e': { // inject a sphere of particles float pr = psystem->getParticleRadius(); @@ -414,7 +445,7 @@ void key(unsigned char key, int /*x*/, int /*y*/) psystem->addSphere(0, pos, vel, ballr, pr*2.0f); } break; - case '4': + case 'b': { // shoot ball from camera float pr = psystem->getParticleRadius(); @@ -484,9 +515,10 @@ void mainMenu(int i) void initMenus() { glutCreateMenu(mainMenu); - glutAddMenuEntry("Reset block [1]", '1'); - glutAddMenuEntry("Reset random [2]", '2'); - glutAddMenuEntry("Add sphere [3]", '3'); + glutAddMenuEntry("Reset block [g]", 'g'); + glutAddMenuEntry("Reset random [a]", 'a'); + glutAddMenuEntry("Add sphere [e]", 'e'); + glutAddMenuEntry("Shoot ball [b]", 'b'); glutAddMenuEntry("View mode [v]", 'v'); glutAddMenuEntry("Move cursor mode [m]", 'm'); glutAddMenuEntry("Toggle point rendering [p]", 'p'); @@ -504,7 +536,8 @@ void initMenus() int main(int argc, char** argv) { - numParticles =1024;//1024;//64;//16380;//32768; +// numParticles =1024;//1024;//64;//16380;//32768; + numParticles =8192; uint gridDim = 64; numIterations = 0; diff --git a/Extras/CUDA/particles_kernel.cu b/Extras/CUDA/particles_kernel.cu index 9817cb384..1673ffcb8 100644 --- a/Extras/CUDA/particles_kernel.cu +++ b/Extras/CUDA/particles_kernel.cu @@ -199,6 +199,35 @@ reorderDataAndFindCellStartD(uint2* particleHash, // particle id sorted by has } + +__global__ void +findCellStartD(uint2* particleHash, // particle id sorted by hash + uint* cellStart) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + + uint2 sortedData = particleHash[index]; + + // Load hash data into shared memory so that we can look + // at neighboring particle's hash value without loading + // two hash values per thread + __shared__ uint sharedHash[257]; + sharedHash[threadIdx.x+1] = sortedData.x; + if (index > 0 && threadIdx.x == 0) + { + // first thread in block must load neighbor particle hash + volatile uint2 prevData = particleHash[index-1]; + sharedHash[0] = prevData.x; + } + + __syncthreads(); + if (index == 0 || sortedData.x != sharedHash[threadIdx.x]) + { + cellStart[sortedData.x] = index; + } +} + + // collide two spheres using DEM method __device__ float3 collideSpheres(float4 posA, float4 posB, float4 velA, float4 velB, @@ -378,4 +407,217 @@ collideD(float4* newPos, float4* newVel, #endif } +__device__ +uint cudaTestAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1) +{ + return (min0.x <= max1.x)&& (min1.x <= max0.x) && + (min0.y <= max1.y)&& (min1.y <= max0.y) && + (min0.z <= max1.z)&& (min1.z <= max0.z); +} + +#if 0 +__device__ +void findPairsInCell( int3 gridPos, + uint index, + uint2* pParticleHash, + uint* pCellStart, + float4* pAABB, + uint* pPairBuff, + uint2* pPairBuffStartCurr) +{ + if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) { + return; + } + uint gridHash = calcGridHash(gridPos); + // get start of bucket for this cell + uint bucketStart = pCellStart[gridHash]; + if (bucketStart == 0xffffffff) + return; // cell empty + // iterate over particles in this cell + float4 min0 = pAABB[index*2]; + float4 max0 = pAABB[index*2 + 1]; + uint2 start_curr = pPairBuffStartCurr[index]; + uint start = start_curr.x; + uint curr = start_curr.y; +// uint* pBuf = pPairBuff; // + curr; + for(uint q=0; q < params.maxParticlesPerCell; q++) + { + uint index2 = bucketStart + q; + uint2 cellData = pParticleHash[index2]; + if (cellData.x != gridHash) break; // no longer in same bucket + if (index2 != index) // check not colliding with self + { + float4 min1 = pAABB[index2*2]; + float4 max1 = pAABB[index2*2 + 1]; + if(cudaTestAABBOverlap(min0, max0, min1, max1)) + { + curr++; + } + //*pBuf = index2; + //pPairBuff[curr] = index2; + //pPairBuff[0] = index2; + //pBuf++; + } + } +// uint del = curr - start; + if(curr != start) + { + curr = start; + } + pPairBuffStartCurr[index] = make_uint2(start, curr); + +// + return; +} +#endif + +__device__ +void findPairsInCell( int3 gridPos, + uint index, + uint2* pParticleHash, + uint* pCellStart, + float4* pAABB, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numParticles) +{ + if ((gridPos.x < 0) || (gridPos.x > params.gridSize.x-1) || + (gridPos.y < 0) || (gridPos.y > params.gridSize.y-1) || + (gridPos.z < 0) || (gridPos.z > params.gridSize.z-1)) { + return; + } + uint gridHash = calcGridHash(gridPos); + // get start of bucket for this cell + uint bucketStart = pCellStart[gridHash]; + if (bucketStart == 0xffffffff) + return; // cell empty + // iterate over particles in this cell + float4 min0 = pAABB[index*2]; + float4 max0 = pAABB[index*2+1]; + + uint2 sortedData = pParticleHash[index]; + uint unsorted_indx = sortedData.y; + uint2 start_curr = pPairBuffStartCurr[unsorted_indx]; +// uint2 start_curr = pPairBuffStartCurr[index]; + + uint start = start_curr.x; + uint curr = start_curr.y; + uint bucketEnd = bucketStart + params.maxParticlesPerCell; + bucketEnd = (bucketEnd > numParticles) ? numParticles : bucketEnd; + for(uint index2=bucketStart; index2 < bucketEnd; index2++) + { + uint2 cellData = pParticleHash[index2]; + if (cellData.x != gridHash) break; // no longer in same bucket + if (index2 < index) // check not colliding with self + { + float4 min1 = pAABB[index2*2]; + float4 max1 = pAABB[index2*2 + 1]; + if(cudaTestAABBOverlap(min0, max0, min1, max1)) + { + uint k; + uint unsorted_indx2 = cellData.y; + for(k = 0; k < curr; k++) + { + uint old_pair = pPairBuff[start+k] & (~BT_CUDA_PAIR_ANY_FLG); + if(old_pair == unsorted_indx2) + { + pPairBuff[start+k] |= BT_CUDA_PAIR_FOUND_FLG; + break; + } + } + if(k == curr) + { + pPairBuff[start+curr] = unsorted_indx2 | BT_CUDA_PAIR_NEW_FLG; + curr++; + } + } + } + } + pPairBuffStartCurr[unsorted_indx] = make_uint2(start, curr); +// pPairBuffStartCurr[index] = make_uint2(start, curr); + return; +} + + + +__global__ void +btCudaFindOverlappingPairsD( + float4* pAABB, + uint2* pParticleHash, + uint* pCellStart, + uint* pPairBuff, + uint2* pPairBuffStartCurr, + uint numParticles +) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + float4 bbMin = pAABB[index*2]; + float4 bbMax = pAABB[index*2+1]; + float4 pos = (bbMin + bbMax) * 0.5f; + // get address in grid + int3 gridPos = calcGridPos(pos); + // examine only neighbouring cells + for(int z=-1; z<=1; z++) { + for(int y=-1; y<=1; y++) { + for(int x=-1; x<=1; x++) { + findPairsInCell(gridPos + make_int3(x, y, z), index, pParticleHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numParticles); + } + } + } +} + + + +__global__ void +btCudaComputePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + uint2 start_curr = pPairBuffStartCurr[index]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint *pInp = pPairBuff + start; + uint num_changes = 0; + for(uint k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) + { + num_changes++; + } + } + pPairScan[index+1] = num_changes; +} + + +__global__ void +btCudaSqueezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan, uint* pPairOut) +{ + int index = __mul24(blockIdx.x,blockDim.x) + threadIdx.x; + uint2 start_curr = pPairBuffStartCurr[index]; + uint start = start_curr.x; + uint curr = start_curr.y; + uint* pInp = pPairBuff + start; + uint* pOut = pPairOut + pPairScan[index]; + uint* pOut2 = pInp; + uint num = 0; + for(uint k = 0; k < curr; k++, pInp++) + { + if(!((*pInp) & BT_CUDA_PAIR_FOUND_FLG)) + { + *pOut = *pInp; + pOut++; + } + if((*pInp) & BT_CUDA_PAIR_ANY_FLG) + { + *pOut2 = (*pInp) & (~BT_CUDA_PAIR_ANY_FLG); + pOut2++; + num++; + } + } + pPairBuffStartCurr[index] = make_uint2(start, num); +} // btCudaBroadphase::squeezeOverlappingPairBuffCPU_D() + + + #endif diff --git a/Extras/CUDA/particles_kernel.cuh b/Extras/CUDA/particles_kernel.cuh index d7fda0089..34da7825c 100644 --- a/Extras/CUDA/particles_kernel.cuh +++ b/Extras/CUDA/particles_kernel.cuh @@ -14,6 +14,12 @@ #define FETCH(t, i) t[i] #endif + +#define BT_CUDA_PAIR_FOUND_FLG (0x40000000) +#define BT_CUDA_PAIR_NEW_FLG (0x20000000) +#define BT_CUDA_PAIR_ANY_FLG (BT_CUDA_PAIR_FOUND_FLG | BT_CUDA_PAIR_NEW_FLG) + + #include "vector_types.h" typedef unsigned int uint; diff --git a/Extras/CUDA/render_particles.cpp b/Extras/CUDA/render_particles.cpp index 6a419ccd3..ce5bdb3df 100644 --- a/Extras/CUDA/render_particles.cpp +++ b/Extras/CUDA/render_particles.cpp @@ -33,8 +33,12 @@ #include #include +#include "BMF_Api.h" + #include "render_particles.h" #include "shaders.h" +#include "LinearMath/btQuickprof.h" +#include "paramgl.h" #ifndef M_PI #define M_PI 3.1415926535897932384626433832795 @@ -50,6 +54,9 @@ ParticleRenderer::ParticleRenderer() m_colorVBO(0) { _initGL(); +#ifndef BT_NO_PROFILE + m_profileIterator = CProfileManager::Get_Iterator(); +#endif //BT_NO_PROFILE } ParticleRenderer::~ParticleRenderer() @@ -178,3 +185,77 @@ void ParticleRenderer::_initGL() glClampColorARB(GL_CLAMP_FRAGMENT_COLOR_ARB, GL_FALSE); #endif } + +#if 1 +void ParticleRenderer::showProfileInfo(float& xOffset,float& yStart, float yIncr) +{ +#ifndef BT_NO_PROFILE + + static double time_since_reset = 0.f; +// if (!m_idle) + { + time_since_reset = CProfileManager::Get_Time_Since_Reset(); + } + beginWinCoords(); + + { + //recompute profiling data, and store profile strings + + char blockTime[128]; + + double totalTime = 0; + + int frames_since_reset = CProfileManager::Get_Frame_Count_Since_Reset(); + + m_profileIterator->First(); + + double parent_time = m_profileIterator->Is_Root() ? time_since_reset : m_profileIterator->Get_Current_Parent_Total_Time(); + + { + sprintf(blockTime,"--- Profiling: %s (total running time: %.3f ms) ---", m_profileIterator->Get_Current_Parent_Name(), parent_time ); + displayProfileString(xOffset,yStart,blockTime); + yStart += yIncr; + sprintf(blockTime,"press number (1,2...) to display child timings, or 0 to go up to parent" ); + displayProfileString(xOffset,yStart,blockTime); + yStart += yIncr; + + } + double accumulated_time = 0.f; + + for (int i = 0; !m_profileIterator->Is_Done(); m_profileIterator->Next()) + { + double current_total_time = m_profileIterator->Get_Current_Total_Time(); + accumulated_time += current_total_time; + double fraction = parent_time > SIMD_EPSILON ? (current_total_time / parent_time) * 100 : 0.f; + + sprintf(blockTime,"%d -- %s (%.2f %%) :: %.3f ms / frame (%d calls)", + ++i, m_profileIterator->Get_Current_Name(), fraction, + (current_total_time / (double)frames_since_reset),m_profileIterator->Get_Current_Total_Calls()); + displayProfileString(xOffset,yStart,blockTime); + yStart += yIncr; + totalTime += current_total_time; + } + + sprintf(blockTime,"%s (%.3f %%) :: %.3f ms", "Unaccounted", + // (min(0, time_since_reset - totalTime) / time_since_reset) * 100); + parent_time > SIMD_EPSILON ? ((parent_time - accumulated_time) / parent_time) * 100 : 0.f, parent_time - accumulated_time); + + displayProfileString(xOffset,yStart,blockTime); + yStart += yIncr; + sprintf(blockTime,"-------------------------------------------------"); + displayProfileString(xOffset,yStart,blockTime); + yStart += yIncr; + + } + endWinCoords(); +#endif//BT_NO_PROFILE +} + + +void ParticleRenderer::displayProfileString(int xOffset,int yStart,char* message) +{ + glRasterPos3f(xOffset,yStart,0); + BMF_DrawString(BMF_GetFont(BMF_kHelvetica10),message); +} + +#endif \ No newline at end of file diff --git a/Extras/CUDA/render_particles.h b/Extras/CUDA/render_particles.h index c60e6ef7c..5ec788dbc 100644 --- a/Extras/CUDA/render_particles.h +++ b/Extras/CUDA/render_particles.h @@ -30,6 +30,9 @@ #ifndef __RENDER_PARTICLES__ #define __RENDER_PARTICLES__ + +class CProfileIterator; + class ParticleRenderer { public: @@ -55,6 +58,11 @@ public: void setFOV(float fov) { m_fov = fov; } void setWindowSize(int w, int h) { m_window_w = w; m_window_h = h; } + void showProfileInfo(float& xOffset,float& yStart, float yIncr); + void displayProfileString(int xOffset,int yStart,char* message); + class CProfileIterator* m_profileIterator; + + protected: // methods void _initGL(); void _drawPoints();