BulletMultiThreaded optimization: avoid some memory copies on non-SPU platforms by using cellDmaGetReadOnly.

This commit is contained in:
erwin.coumans
2008-12-12 22:28:16 +00:00
parent c541414c84
commit 4be20dc347
4 changed files with 254 additions and 239 deletions

View File

@@ -61,7 +61,7 @@ DoubleBuffer<T,size>::backBufferDmaGet(uint64_t ea, unsigned int numBytes, unsig
{
m_dmaPending = true;
m_dmaTag = tag;
cellDmaLargeGet(m_backBuffer, ea, numBytes, tag, 0, 0);
m_backBuffer = (T*)cellDmaLargeGetReadOnly(m_backBuffer, ea, numBytes, tag, 0, 0);
}
template<class T, int size>

View File

@@ -11,199 +11,199 @@ subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#include "SpuFakeDma.h"
#include <LinearMath/btScalar.h> //for btAssert
//Disabling memcpy sometimes helps debugging DMA
#define USE_MEMCPY 1
#ifdef USE_MEMCPY
#endif
///Read-only variant of cellDmaLargeGet: returns a pointer through which the data at effective address 'ea' can be read.
///When the guard below is active the data is transferred into 'ls' with cellDmaLargeGet and 'ls' is returned.
///Otherwise nothing is copied and 'ea' itself is returned as a pointer; 'ls', 'size', 'tag', 'tid' and 'rid' are unused.
///In that case the result aliases the original memory, so writes through it would modify the source data.
///NOTE(review): this guard tests __CELLOS_LV2__ while cellDmaSmallGetReadOnly/cellDmaGetReadOnly test __SPU__ -- confirm which is intended.
void* cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__CELLOS_LV2__) || defined (USE_LIBSPE2)
	cellDmaLargeGet(ls,ea,size,tag,tid,rid);
	return ls;
#else
	//Convert through uintptr_t: a uint32_t cast drops the upper 32 bits of the address on 64-bit hosts.
	return (void*)(uintptr_t)ea;
#endif
}
///Read-only small get: returns a pointer through which the data at effective address 'ea' can be read.
///On __SPU__ / USE_LIBSPE2 builds the data is fetched into 'ls' with mfc_get (tid and rid are passed as 0) and 'ls' is returned.
///On other builds nothing is copied and 'ea' itself is returned as a pointer; 'ls', 'size', 'tag', 'tid' and 'rid' are unused.
///In that case the result aliases the original memory, so writes through it would modify the source data.
void* cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	mfc_get(ls,ea,size,tag,0,0);
	return ls;
#else
	//Convert through uintptr_t: a uint32_t cast drops the upper 32 bits of the address on 64-bit hosts.
	return (void*)(uintptr_t)ea;
#endif
}
///Read-only variant of cellDmaGet: returns a pointer through which the data at effective address 'ea' can be read.
///On __SPU__ / USE_LIBSPE2 builds the data is transferred into 'ls' with cellDmaGet and 'ls' is returned.
///On other builds nothing is copied and 'ea' itself is returned as a pointer; 'ls', 'size', 'tag', 'tid' and 'rid' are unused.
///In that case the result aliases the original memory, so writes through it would modify the source data.
void* cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	cellDmaGet(ls,ea,size,tag,tid,rid);
	return ls;
#else
	//Convert through uintptr_t: a uint32_t cast drops the upper 32 bits of the address on 64-bit hosts.
	return (void*)(uintptr_t)ea;
#endif
}
///Stalling DMA get for small, possibly unaligned data; it should not be used frequently.
///Copies 'size' bytes (asserted to be < 32) from effective address 'ea' to 'ls' through an aligned
///temporary buffer, waiting on DMA_MASK(1) before the final copy. Always returns 0.
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
{
	btAssert(size<32);

	///The payload starts up to 15 bytes into the staging buffer and is up to 31 bytes long,
	///so 48 bytes are required; a 32-byte buffer can be overrun (e.g. offset 15 with size 31).
	ATTRIBUTE_ALIGNED16(char tmpBuffer[48]);

	char* localStore = (char*)ls;
	uint32_t i;

	///make sure last 4 bits are the same, for cellDmaSmallGet
	uint32_t last4BitsOffset = ea & 0x0f;
	char* tmpTarget = tmpBuffer + last4BitsOffset;

#if defined (__SPU__) || defined (USE_LIBSPE2)

	int remainingSize = size;

//#define FORCE_cellDmaUnalignedGet 1
#ifdef FORCE_cellDmaUnalignedGet
	cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
#else
	///Split the request into transfers of 16, 8, 4, 2 or 1 bytes.
	///NOTE(review): the pieces are not checked for natural alignment of 'remainingEa' -- confirm that
	///mfc_get accepts this for every offset/size combination.
	char* remainingTmpTarget = tmpTarget;
	uint64_t remainingEa = ea;

	while (remainingSize)
	{
		switch (remainingSize)
		{
		case 1:
		case 2:
		case 4:
		case 8:
		case 16:
			{
				//the remainder is itself one of the supported sizes: fetch it in one transfer
				mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0);
				remainingSize=0;
				break;
			}
		default:
			{
				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
				//take the largest supported size strictly below the remainder, then loop for the rest
				int actualSize = 0;

				if (remainingSize > 16)
					actualSize = 16;
				else
					if (remainingSize >8)
						actualSize=8;
					else
						if (remainingSize >4)
							actualSize=4;
						else
							if (remainingSize >2)
								actualSize=2;
				mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0);
				remainingSize-=actualSize;
				remainingTmpTarget+=actualSize;
				remainingEa += actualSize;
			}
		}
	}
#endif//FORCE_cellDmaUnalignedGet

#else
	//no DMA on this path: copy straight from main memory into the staging buffer
	char* mainMem = (char*)ea;
#ifdef USE_MEMCPY
	memcpy(tmpTarget,mainMem,size);
#else
	for ( i=0;i<size;i++)
	{
		tmpTarget[i] = mainMem[i];
	}
#endif //USE_MEMCPY

#endif

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i=0; btLikely( i<size );i++)
	{
		localStore[i] = tmpTarget[i];
	}

	return 0;
}
#if defined (__SPU__) || defined (USE_LIBSPE2)
#else
///Stand-in for cellDmaLargeGet on builds without real DMA: synchronously copies 'size' bytes
///from the memory at address 'ea' into 'ls'. 'tag', 'tid' and 'rid' are not used. Always returns 0.
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* src = (const char*)ea;
	char* dst = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(dst,src,size);
#else
	//byte-wise copy, used when memcpy is disabled for debugging
	const char* const srcEnd = src + size;
	while (src != srcEnd)
	{
		*dst++ = *src++;
	}
#endif

	return 0;
}
///Stand-in for cellDmaGet on builds without real DMA: synchronously copies 'size' bytes
///from the memory at address 'ea' into 'ls'. 'tag', 'tid' and 'rid' are not used. Always returns 0.
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* src = (const char*)ea;
	char* dst = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(dst,src,size);
#else
	//byte-wise copy, used when memcpy is disabled for debugging
	const char* const srcEnd = src + size;
	while (src != srcEnd)
	{
		*dst++ = *src++;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///Stand-in for cellDmaLargePut on builds without real DMA: synchronously copies 'size' bytes
///from 'ls' to the memory at address 'ea'. 'tag', 'tid' and 'rid' are not used. Always returns 0.
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* src = (const char*)ls;
	char* dst = (char*)ea;

#ifdef USE_MEMCPY
	memcpy(dst,src,size);
#else
	//byte-wise copy, used when memcpy is disabled for debugging
	const char* const srcEnd = src + size;
	while (src != srcEnd)
	{
		*dst++ = *src++;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///No-op stand-in for the DMA tag wait on builds without real DMA; the argument is not used.
void cellDmaWaitTagStatusAll(int ignore)
{
	(void)ignore;
}
#endif
*/
#include "SpuFakeDma.h"
#include <LinearMath/btScalar.h> //for btAssert
//Disabling memcpy sometimes helps debugging DMA
#define USE_MEMCPY 1
#ifdef USE_MEMCPY
#endif
///Read-only variant of cellDmaLargeGet: returns a pointer through which the data at effective address 'ea' can be read.
///On __SPU__ / USE_LIBSPE2 builds the data is transferred into 'ls' with cellDmaLargeGet and 'ls' is returned.
///On other builds nothing is copied and 'ea' itself is returned as a pointer; 'ls', 'size', 'tag', 'tid' and 'rid' are unused.
///In that case the result aliases the original memory, so writes through it would modify the source data.
void* cellDmaLargeGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	cellDmaLargeGet(ls,ea,size,tag,tid,rid);
	return ls;
#else
	//Convert through uintptr_t: a uint32_t cast drops the upper 32 bits of the address on 64-bit hosts.
	return (void*)(uintptr_t)ea;
#endif
}
///Read-only small get: returns a pointer through which the data at effective address 'ea' can be read.
///On __SPU__ / USE_LIBSPE2 builds the data is fetched into 'ls' with mfc_get (tid and rid are passed as 0) and 'ls' is returned.
///On other builds nothing is copied and 'ea' itself is returned as a pointer; 'ls', 'size', 'tag', 'tid' and 'rid' are unused.
///In that case the result aliases the original memory, so writes through it would modify the source data.
void* cellDmaSmallGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	mfc_get(ls,ea,size,tag,0,0);
	return ls;
#else
	//Convert through uintptr_t: a uint32_t cast drops the upper 32 bits of the address on 64-bit hosts.
	return (void*)(uintptr_t)ea;
#endif
}
///Read-only variant of cellDmaGet: returns a pointer through which the data at effective address 'ea' can be read.
///On __SPU__ / USE_LIBSPE2 builds the data is transferred into 'ls' with cellDmaGet and 'ls' is returned.
///On other builds nothing is copied and 'ea' itself is returned as a pointer; 'ls', 'size', 'tag', 'tid' and 'rid' are unused.
///In that case the result aliases the original memory, so writes through it would modify the source data.
void* cellDmaGetReadOnly(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
#if defined (__SPU__) || defined (USE_LIBSPE2)
	cellDmaGet(ls,ea,size,tag,tid,rid);
	return ls;
#else
	//Convert through uintptr_t: a uint32_t cast drops the upper 32 bits of the address on 64-bit hosts.
	return (void*)(uintptr_t)ea;
#endif
}
///Stalling DMA get for small, possibly unaligned data; it should not be used frequently.
///Copies 'size' bytes (asserted to be < 32) from effective address 'ea' to 'ls' through an aligned
///temporary buffer, waiting on DMA_MASK(1) before the final copy. Always returns 0.
int stallingUnalignedDmaSmallGet(void *ls, uint64_t ea, uint32_t size)
{
	btAssert(size<32);

	///The payload starts up to 15 bytes into the staging buffer and is up to 31 bytes long,
	///so 48 bytes are required; a 32-byte buffer can be overrun (e.g. offset 15 with size 31).
	ATTRIBUTE_ALIGNED16(char tmpBuffer[48]);

	char* localStore = (char*)ls;
	uint32_t i;

	///make sure last 4 bits are the same, for cellDmaSmallGet
	uint32_t last4BitsOffset = ea & 0x0f;
	char* tmpTarget = tmpBuffer + last4BitsOffset;

#if defined (__SPU__) || defined (USE_LIBSPE2)

	int remainingSize = size;

//#define FORCE_cellDmaUnalignedGet 1
#ifdef FORCE_cellDmaUnalignedGet
	cellDmaUnalignedGet(tmpTarget,ea,size,DMA_TAG(1),0,0);
#else
	///Split the request into transfers of 16, 8, 4, 2 or 1 bytes.
	///NOTE(review): the pieces are not checked for natural alignment of 'remainingEa' -- confirm that
	///mfc_get accepts this for every offset/size combination.
	char* remainingTmpTarget = tmpTarget;
	uint64_t remainingEa = ea;

	while (remainingSize)
	{
		switch (remainingSize)
		{
		case 1:
		case 2:
		case 4:
		case 8:
		case 16:
			{
				//the remainder is itself one of the supported sizes: fetch it in one transfer
				mfc_get(remainingTmpTarget,remainingEa,remainingSize,DMA_TAG(1),0,0);
				remainingSize=0;
				break;
			}
		default:
			{
				//spu_printf("unaligned DMA with non-natural size:%d\n",remainingSize);
				//take the largest supported size strictly below the remainder, then loop for the rest
				int actualSize = 0;

				if (remainingSize > 16)
					actualSize = 16;
				else
					if (remainingSize >8)
						actualSize=8;
					else
						if (remainingSize >4)
							actualSize=4;
						else
							if (remainingSize >2)
								actualSize=2;
				mfc_get(remainingTmpTarget,remainingEa,actualSize,DMA_TAG(1),0,0);
				remainingSize-=actualSize;
				remainingTmpTarget+=actualSize;
				remainingEa += actualSize;
			}
		}
	}
#endif//FORCE_cellDmaUnalignedGet

#else
	//no DMA on this path: copy straight from main memory into the staging buffer
	char* mainMem = (char*)ea;
#ifdef USE_MEMCPY
	memcpy(tmpTarget,mainMem,size);
#else
	for ( i=0;i<size;i++)
	{
		tmpTarget[i] = mainMem[i];
	}
#endif //USE_MEMCPY

#endif

	cellDmaWaitTagStatusAll(DMA_MASK(1));

	//this is slowish, perhaps memcpy on SPU is smarter?
	for (i=0; btLikely( i<size );i++)
	{
		localStore[i] = tmpTarget[i];
	}

	return 0;
}
#if defined (__SPU__) || defined (USE_LIBSPE2)
#else
///Stand-in for cellDmaLargeGet on builds without real DMA: synchronously copies 'size' bytes
///from the memory at address 'ea' into 'ls'. 'tag', 'tid' and 'rid' are not used. Always returns 0.
int cellDmaLargeGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* src = (const char*)ea;
	char* dst = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(dst,src,size);
#else
	//byte-wise copy, used when memcpy is disabled for debugging
	const char* const srcEnd = src + size;
	while (src != srcEnd)
	{
		*dst++ = *src++;
	}
#endif

	return 0;
}
///Stand-in for cellDmaGet on builds without real DMA: synchronously copies 'size' bytes
///from the memory at address 'ea' into 'ls'. 'tag', 'tid' and 'rid' are not used. Always returns 0.
int cellDmaGet(void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* src = (const char*)ea;
	char* dst = (char*)ls;

#ifdef USE_MEMCPY
	memcpy(dst,src,size);
#else
	//byte-wise copy, used when memcpy is disabled for debugging
	const char* const srcEnd = src + size;
	while (src != srcEnd)
	{
		*dst++ = *src++;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///Stand-in for cellDmaLargePut on builds without real DMA: synchronously copies 'size' bytes
///from 'ls' to the memory at address 'ea'. 'tag', 'tid' and 'rid' are not used. Always returns 0.
int cellDmaLargePut(const void *ls, uint64_t ea, uint32_t size, uint32_t tag, uint32_t tid, uint32_t rid)
{
	const char* src = (const char*)ls;
	char* dst = (char*)ea;

#ifdef USE_MEMCPY
	memcpy(dst,src,size);
#else
	//byte-wise copy, used when memcpy is disabled for debugging
	const char* const srcEnd = src + size;
	while (src != srcEnd)
	{
		*dst++ = *src++;
	}
#endif //#ifdef USE_MEMCPY

	return 0;
}
///No-op stand-in for the DMA tag wait on builds without real DMA; the argument is not used.
void cellDmaWaitTagStatusAll(int ignore)
{
	(void)ignore;
}
#endif

View File

@@ -161,6 +161,9 @@ bool ManifoldResultAddContactPoint(const btVector3& normalOnBInWorld,
void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManifold, btPersistentManifold* mmManifold)
{
///only write back the contact information on SPU. Other platforms avoid copying, and use the data in-place
///see SpuFakeDma.cpp 'cellDmaLargeGetReadOnly'
#if defined (__SPU__) || defined (USE_LIBSPE2)
memcpy(g_manifoldDmaExport.getFront(),lsManifold,sizeof(btPersistentManifold));
g_manifoldDmaExport.swapBuffers();
@@ -168,6 +171,7 @@ void SpuContactResult::writeDoubleBufferedManifold(btPersistentManifold* lsManif
g_manifoldDmaExport.backBufferDmaPut(mmAddr, sizeof(btPersistentManifold), DMA_TAG(9));
// Should there be any kind of wait here? What if somebody tries to use this tag again? What if we call this function again really soon?
//no, the swapBuffers does the wait
#endif
}
void SpuContactResult::addContactPoint(const btVector3& normalOnBInWorld,const btVector3& pointInWorld,float depth)

View File

@@ -122,45 +122,56 @@ bool gUseEpa = false;
///Make sure no destructors are called on this memory
///Working storage for a collision task: aligned raw byte buffers sized for btCollisionObject and
///SpuContactManifoldCollisionAlgorithm, broadphase pair arrays, a persistent manifold, per-shape data,
///and accessors that hand out typed pointers to that data.
///NOTE(review): this listing contains both the members that return the embedded storage directly
///(gColObj0, gColObj1, gSpuContactManifoldAlgo, gPersistentManifold, gBroadphasePairs) and the
///'...Buffer' members paired with m_ls...Ptr pointers; getColObj0, getColObj1, getlocalCollisionAlgorithm,
///gCollisionShapes and needsDmaPutContactManifoldAlgo therefore appear twice below.
struct CollisionTask_LocalStoreMemory
{
//raw, 16-byte aligned storage for the two collision objects (16 spare bytes each)
ATTRIBUTE_ALIGNED16(char gColObj0 [sizeof(btCollisionObject)+16]);
ATTRIBUTE_ALIGNED16(char gColObj1 [sizeof(btCollisionObject)+16]);
//view the embedded gColObj0 bytes as a btCollisionObject
btCollisionObject* getColObj0()
{
return (btCollisionObject*) gColObj0;
}
//view the embedded gColObj1 bytes as a btCollisionObject
btCollisionObject* getColObj1()
{
return (btCollisionObject*) gColObj1;
}
bool needsDmaPutContactManifoldAlgo;
///This CollisionTask_LocalStoreMemory is mainly used for the SPU version, using explicit DMA
///Other platforms can use other memory programming models.
ATTRIBUTE_ALIGNED16(btBroadphasePair gBroadphasePairsBuffer[SPU_BATCHSIZE_BROADPHASE_PAIRS]);
DoubleBuffer<unsigned char, MIDPHASE_WORKUNIT_PAGE_SIZE> g_workUnitTaskBuffers;
ATTRIBUTE_ALIGNED16(btBroadphasePair gBroadphasePairs[SPU_BATCHSIZE_BROADPHASE_PAIRS]);
//SpuContactManifoldCollisionAlgorithm gSpuContactManifoldAlgo;
//ATTRIBUTE_ALIGNED16(char gSpuContactManifoldAlgo[sizeof(SpuContactManifoldCollisionAlgorithm)+128]);
ATTRIBUTE_ALIGNED16(char gSpuContactManifoldAlgo [sizeof(SpuContactManifoldCollisionAlgorithm)+16]);
//view the embedded gSpuContactManifoldAlgo bytes as a SpuContactManifoldCollisionAlgorithm
SpuContactManifoldCollisionAlgorithm* getlocalCollisionAlgorithm()
{
return (SpuContactManifoldCollisionAlgorithm*)&gSpuContactManifoldAlgo;
}
btPersistentManifold gPersistentManifold;
CollisionShape_LocalStoreMemory gCollisionShapes[2];
//raw, 16-byte aligned buffers; these are the local-store targets passed to the cellDma*GetReadOnly calls
ATTRIBUTE_ALIGNED16(char gSpuContactManifoldAlgoBuffer [sizeof(SpuContactManifoldCollisionAlgorithm)+16]);
ATTRIBUTE_ALIGNED16(char gColObj0Buffer [sizeof(btCollisionObject)+16]);
ATTRIBUTE_ALIGNED16(char gColObj1Buffer [sizeof(btCollisionObject)+16]);
///we reserve 32bit integer indices, even though they might be 16bit
ATTRIBUTE_ALIGNED16(int spuIndices[16]);
btPersistentManifold gPersistentManifoldBuffer;
CollisionShape_LocalStoreMemory gCollisionShapes[2];
bvhMeshShape_LocalStoreMemory bvhShapeData;
SpuConvexPolyhedronVertexData convexVertexData[2];
CompoundShape_LocalStoreMemory compoundShapeData[2];
///The following pointers might either point into this local store memory, or to the original/other memory locations.
///See SpuFakeDma for implementation of cellDmaSmallGetReadOnly.
btCollisionObject* m_lsColObj0Ptr;
btCollisionObject* m_lsColObj1Ptr;
btBroadphasePair* m_pairsPointer;
btPersistentManifold* m_lsManifoldPtr;
SpuContactManifoldCollisionAlgorithm* m_lsCollisionAlgorithmPtr;
bool needsDmaPutContactManifoldAlgo;
//returns m_lsColObj0Ptr (not the embedded buffer)
btCollisionObject* getColObj0()
{
return m_lsColObj0Ptr;
}
//returns m_lsColObj1Ptr (not the embedded buffer)
btCollisionObject* getColObj1()
{
return m_lsColObj1Ptr;
}
//returns m_pairsPointer; callers index it per broadphase pair
btBroadphasePair* getBroadphasePairPtr()
{
return m_pairsPointer;
}
//returns m_lsCollisionAlgorithmPtr
SpuContactManifoldCollisionAlgorithm* getlocalCollisionAlgorithm()
{
return m_lsCollisionAlgorithmPtr;
}
//returns m_lsManifoldPtr
btPersistentManifold* getContactManifoldPtr()
{
return m_lsManifoldPtr;
}
};
@@ -560,12 +571,12 @@ void ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
cpInput.m_convexVertexData[1] = &lsMemPtr->convexVertexData[1];
cpInput.m_transformA = wuInput->m_worldTransform0;
cpInput.m_transformB = wuInput->m_worldTransform1;
float sumMargin = (marginA+marginB+lsMemPtr->gPersistentManifold.getContactBreakingThreshold());
float sumMargin = (marginA+marginB+lsMemPtr->getContactManifoldPtr()->getContactBreakingThreshold());
cpInput.m_maximumDistanceSquared = sumMargin * sumMargin;
ppu_address_t manifoldAddress = (ppu_address_t)manifold;
btPersistentManifold* spuManifold=&lsMemPtr->gPersistentManifold;
btPersistentManifold* spuManifold=lsMemPtr->getContactManifoldPtr();
//spuContacts.setContactInfo(spuManifold,manifoldAddress,wuInput->m_worldTransform0,wuInput->m_worldTransform1,wuInput->m_isSwapped);
spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMemPtr->getColObj0()->getWorldTransform(),
lsMemPtr->getColObj1()->getWorldTransform(),
@@ -605,11 +616,11 @@ SIMD_FORCE_INLINE void dmaAndSetupCollisionObjects(SpuCollisionPairInput& collis
dmaSize = sizeof(btCollisionObject);//btTransform);
dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr1->m_clientObject :*/ (ppu_address_t)lsMem.getlocalCollisionAlgorithm()->getCollisionObject0();
cellDmaGet(&lsMem.gColObj0, dmaPpuAddress2 , dmaSize, DMA_TAG(1), 0, 0);
lsMem.m_lsColObj0Ptr = (btCollisionObject*)cellDmaGetReadOnly(&lsMem.gColObj0Buffer, dmaPpuAddress2 , dmaSize, DMA_TAG(1), 0, 0);
dmaSize = sizeof(btCollisionObject);//btTransform);
dmaPpuAddress2 = /*collisionPairInput.m_isSwapped ? (ppu_address_t)lsMem.gProxyPtr0->m_clientObject :*/ (ppu_address_t)lsMem.getlocalCollisionAlgorithm()->getCollisionObject1();
cellDmaGet(&lsMem.gColObj1, dmaPpuAddress2 , dmaSize, DMA_TAG(2), 0, 0);
lsMem.m_lsColObj1Ptr = (btCollisionObject*)cellDmaGetReadOnly(&lsMem.gColObj1Buffer, dmaPpuAddress2 , dmaSize, DMA_TAG(2), 0, 0);
cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
@@ -899,7 +910,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr)
{
dmaSize = numPairs*sizeof(btBroadphasePair);
dmaPpuAddress = wuInputs[j].m_pairArrayPtr+wuInputs[j].m_startIndex * sizeof(btBroadphasePair);
cellDmaGet(&lsMem.gBroadphasePairs, dmaPpuAddress , dmaSize, DMA_TAG(1), 0, 0);
lsMem.m_pairsPointer = (btBroadphasePair*)cellDmaGetReadOnly(&lsMem.gBroadphasePairsBuffer, dmaPpuAddress , dmaSize, DMA_TAG(1), 0, 0);
cellDmaWaitTagStatusAll(DMA_MASK(1));
@@ -908,7 +919,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr)
//for each broadphase pair, do something
btBroadphasePair& pair = lsMem.gBroadphasePairs[p];
btBroadphasePair& pair = lsMem.getBroadphasePairPtr()[p];
#ifdef DEBUG_SPU_COLLISION_DETECTION
spu_printf("pair->m_userInfo = %d\n",pair.m_userInfo);
spu_printf("pair->m_algorithm = %d\n",pair.m_algorithm);
@@ -920,7 +931,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr)
{
dmaSize = sizeof(SpuContactManifoldCollisionAlgorithm);
dmaPpuAddress2 = (ppu_address_t)pair.m_algorithm;
cellDmaGet(&lsMem.gSpuContactManifoldAlgo, dmaPpuAddress2 , dmaSize, DMA_TAG(1), 0, 0);
lsMem.m_lsCollisionAlgorithmPtr = (SpuContactManifoldCollisionAlgorithm*)cellDmaGetReadOnly(&lsMem.gSpuContactManifoldAlgoBuffer, dmaPpuAddress2 , dmaSize, DMA_TAG(1), 0, 0);
cellDmaWaitTagStatusAll(DMA_MASK(1));
@@ -944,7 +955,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr)
dmaSize = sizeof(btPersistentManifold);
dmaPpuAddress2 = collisionPairInput.m_persistentManifoldPtr;
cellDmaGet(&lsMem.gPersistentManifold, dmaPpuAddress2 , dmaSize, DMA_TAG(1), 0, 0);
lsMem.m_lsManifoldPtr = (btPersistentManifold*)cellDmaGetReadOnly(&lsMem.gPersistentManifoldBuffer, dmaPpuAddress2 , dmaSize, DMA_TAG(1), 0, 0);
collisionPairInput.m_shapeType0 = lsMem.getlocalCollisionAlgorithm()->getShapeType0();
collisionPairInput.m_shapeType1 = lsMem.getlocalCollisionAlgorithm()->getShapeType1();
@@ -977,7 +988,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr)
if (boxbox)
{
//spu_printf("boxbox dist = %f\n",distance);
btPersistentManifold* spuManifold=&lsMem.gPersistentManifold;
btPersistentManifold* spuManifold=lsMem.getContactManifoldPtr();
btPersistentManifold* manifold = (btPersistentManifold*)collisionPairInput.m_persistentManifoldPtr;
ppu_address_t manifoldAddress = (ppu_address_t)manifold;
@@ -1107,7 +1118,7 @@ void processCollisionTask(void* userPtr, void* lsMemPtr)
} else
{
//spu_printf("boxbox dist = %f\n",distance);
btPersistentManifold* spuManifold=&lsMem.gPersistentManifold;
btPersistentManifold* spuManifold=lsMem.getContactManifoldPtr();
btPersistentManifold* manifold = (btPersistentManifold*)collisionPairInput.m_persistentManifoldPtr;
ppu_address_t manifoldAddress = (ppu_address_t)manifold;