From 08d036afc7b5b70841a0e703f2d682cccdc0bfec Mon Sep 17 00:00:00 2001
From: ejcoumans <ejcoumans@08e121b0-ae19-0410-a57b-3be3395fd4fd>
Date: Thu, 20 Sep 2007 22:42:56 +0000
Subject: [PATCH] multi-threading improvements: optionally use software caching
 (IBM/PS3), move some code from cpp to header to inline.

---
 .../SpuGatheringCollisionTask.cpp             | 121 +++++++++++++++---
 .../CollisionDispatch/btCollisionObject.h     |   4 +-
 .../CollisionShapes/btOptimizedBvh.cpp        |  13 --
 .../CollisionShapes/btOptimizedBvh.h          |  20 ++-
 4 files changed, 120 insertions(+), 38 deletions(-)
diff --git a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
index d5e2f3e9a..cff34e2a2 100644
--- a/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
+++ b/Extras/BulletMultiThreaded/SpuNarrowPhaseCollisionTask/SpuGatheringCollisionTask.cpp
@@ -27,6 +27,59 @@
 
 #include "SpuLocalSupport.h" //definition of SpuConvexPolyhedronVertexData
 
+#ifdef __CELLOS_LV2__
+///Software caching from the IBM Cell SDK; it reduces SPU time by 25% for our test cases
+#define USE_SOFTWARE_CACHE 1
+#endif //__CELLOS_LV2__
+
+////////////////////////////////////////////////
+/// software caching
+#if USE_SOFTWARE_CACHE
+#include <spu_intrinsics.h>
+#include <sys/spu_thread.h>
+#include <sys/spu_event.h>
+#include <stdint.h>
+#define SPE_CACHE_NWAY   		4
+//#define SPE_CACHE_NSETS 		32, 16
+#define SPE_CACHE_NSETS 		8
+//#define SPE_CACHELINE_SIZE 		512
+#define SPE_CACHELINE_SIZE 		128
+#define SPE_CACHE_SET_TAGID(set) 	15
+///spe_cache.h must be included after the SPE_CACHE_* defines above!
+#include "spe_cache.h"
+
+
+int g_CacheMisses=0;	///< incremented by the instrumented spe_cache_read() whenever the lookup yields idx < 0
+int g_CacheHits=0;	///< incremented by the instrumented spe_cache_read() whenever the lookup yields idx >= 0
+///spe_cache_read(ea): statement-expression macro whose value is a void* into spe_cache_mem for address 'ea'; the instrumented form also updates the two counters above.
+#if 0 // 0: use the instrumented macro below, which counts cache hits and misses; change this to 1 to restore the unmodified version
+#define spe_cache_read(ea)		_spe_cache_lookup_xfer_wait_(ea, 0, 1)
+#else
+#define spe_cache_read(ea)		\
+({								\
+    int set, idx, line, byte;					\
+    _spe_cache_nway_lookup_(ea, set, idx);			\
+    /* idx < 0 after the lookup is counted as a miss; _spe_cache_miss_ then supplies the idx to use */	\
+    if (unlikely(idx < 0)) {					\
+        ++g_CacheMisses;                        \
+	    idx = _spe_cache_miss_(ea, set, -1);			\
+        spu_writech(22, SPE_CACHE_SET_TAGMASK(set));	/* NOTE(review): channel 22 is presumably the MFC tag-mask channel - confirm */	\
+        spu_mfcstat(MFC_TAG_UPDATE_ALL);	/* presumably waits for the transfer started by the miss handler - confirm */	\
+    } 								\
+    else                            \
+    {                               \
+        ++g_CacheHits;              \
+    }                               \
+    line = _spe_cacheline_num_(set, idx);			\
+    byte = _spe_cacheline_byte_offset_(ea);			\
+    (void *) &spe_cache_mem[line + byte];	/* this is the value of the whole ({ ... }) expression */	\
+})
+
+#endif
+
+#endif // USE_SOFTWARE_CACHE
+
+
 #ifdef USE_SN_TUNER
 #include <LibSN_SPU.h>
 #endif //USE_SN_TUNER
@@ -55,8 +108,19 @@ struct	CollisionTask_LocalStoreMemory
 	ATTRIBUTE_ALIGNED16(btBroadphaseProxy*	gProxyPtr0);
 	ATTRIBUTE_ALIGNED16(btBroadphaseProxy*	gProxyPtr1);
 
-	ATTRIBUTE_ALIGNED16(btCollisionObject	gColObj0);
-	ATTRIBUTE_ALIGNED16(btCollisionObject	gColObj1);
+	//ATTRIBUTE_ALIGNED16(btCollisionObject	gColObj0);
+	//ATTRIBUTE_ALIGNED16(btCollisionObject	gColObj1);
+	ATTRIBUTE_ALIGNED16(char gColObj0 [sizeof(btCollisionObject)+16]);	///< raw byte storage for collision object 0; access it through getColObj0()
+	ATTRIBUTE_ALIGNED16(char gColObj1 [sizeof(btCollisionObject)+16]);	///< raw byte storage for collision object 1; access it through getColObj1()
+	///Typed views of the raw buffers above. NOTE(review): the buffers hold bytes, not constructed objects; presumably they are filled by DMA before use - confirm against dmaAndSetupCollisionObjects.
+	btCollisionObject* getColObj0()
+	{
+		return reinterpret_cast<btCollisionObject*>(gColObj0);
+	}
+	btCollisionObject* getColObj1()
+	{
+		return reinterpret_cast<btCollisionObject*>(gColObj1);
+	}
 
 	DoubleBuffer<unsigned char, MIDPHASE_WORKUNIT_PAGE_SIZE> g_workUnitTaskBuffers;
 	ATTRIBUTE_ALIGNED16(btBroadphasePair	gBroadphasePairs[SPU_BATCHSIZE_BROADPHASE_PAIRS]);
@@ -79,7 +143,13 @@ struct	CollisionTask_LocalStoreMemory
 
 	ATTRIBUTE_ALIGNED16(int	spuIndices[16]);
 
-	ATTRIBUTE_ALIGNED16(btOptimizedBvh	gOptimizedBvh);
+	//ATTRIBUTE_ALIGNED16(btOptimizedBvh	gOptimizedBvh);
+	ATTRIBUTE_ALIGNED16(char gOptimizedBvh[sizeof(btOptimizedBvh)+16]);	///< raw byte storage for the btOptimizedBvh; access it through getOptimizedBvh()
+	btOptimizedBvh*	getOptimizedBvh()	///< typed view of the gOptimizedBvh byte buffer (no object is constructed here)
+	{
+		return reinterpret_cast<btOptimizedBvh*>(gOptimizedBvh);
+	}
+
 	ATTRIBUTE_ALIGNED16(btTriangleIndexVertexArray	gTriangleMeshInterface);
 	///only a single mesh part for now, we can add support for multiple parts, but quantized trees don't support this at the moment 
 	ATTRIBUTE_ALIGNED16(btIndexedMesh	gIndexMesh);
@@ -120,15 +190,26 @@ void* createCollisionLocalStoreMemory()
 
 void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTask_LocalStoreMemory* lsMemPtr, SpuContactResult& spuContacts);
 
-
-inline bool spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int*  aabbMax2)
+#define USE_BRANCHFREE_TEST 1
+#ifdef USE_BRANCHFREE_TEST
+unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int* aabbMax2)
+{	///Branch-free overlap test of two quantized AABBs: the result is 1 when the boxes overlap on all three axes, 0 otherwise. All four inputs are read-only, same signature as the #else variant.
+	return btSelect((unsigned)((aabbMin1[0] <= aabbMax2[0]) & (aabbMax1[0] >= aabbMin2[0])	//bitwise & (not &&) so the six comparisons combine without short-circuit branches
+		& (aabbMin1[2] <= aabbMax2[2]) & (aabbMax1[2] >= aabbMin2[2])
+		& (aabbMin1[1] <= aabbMax2[1]) & (aabbMax1[1] >= aabbMin2[1])),
+		1, 0);
+}
+#else
+unsigned int spuTestQuantizedAabbAgainstQuantizedAabb(const unsigned short int* aabbMin1,const unsigned short int* aabbMax1,const unsigned short int* aabbMin2,const unsigned short int*  aabbMax2)
 {
-	bool overlap = true;
-	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? false : overlap;
-	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? false : overlap;
-	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? false : overlap;
+	unsigned int overlap = 1;	//fallback (non branch-free) variant: starts at 1 and is cleared to 0 by any axis on which the boxes are separated
+	overlap = (aabbMin1[0] > aabbMax2[0] || aabbMax1[0] < aabbMin2[0]) ? 0 : overlap;	//axis 0
+	overlap = (aabbMin1[2] > aabbMax2[2] || aabbMax1[2] < aabbMin2[2]) ? 0 : overlap;	//axis 2 is tested before axis 1
+	overlap = (aabbMin1[1] > aabbMax2[1] || aabbMax1[1] < aabbMin2[1]) ? 0 : overlap;	//axis 1
 	return overlap;
 }
+#endif
+
 
 
 void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,const btQuantizedBvhNode* rootNode,int startNodeIndex,int endNodeIndex)
@@ -140,7 +221,7 @@ void	spuWalkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned
 
 	int escapeIndex;
 
-	bool aabbOverlap, isLeafNode;
+	unsigned int aabbOverlap, isLeafNode;
 
 	while (curIndex < endNodeIndex)
 	{
@@ -406,13 +487,13 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 	///quantize query AABB
 	unsigned short int quantizedQueryAabbMin[3];
 	unsigned short int quantizedQueryAabbMax[3];
-	lsMemPtr->gOptimizedBvh.quantizeWithClamp(quantizedQueryAabbMin,aabbMin);
-	lsMemPtr->gOptimizedBvh.quantizeWithClamp(quantizedQueryAabbMax,aabbMax);
+	lsMemPtr->getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMin,aabbMin);
+	lsMemPtr->getOptimizedBvh()->quantizeWithClamp(quantizedQueryAabbMax,aabbMax);
 
-	QuantizedNodeArray&	nodeArray = lsMemPtr->gOptimizedBvh.getQuantizedNodeArray();
+	QuantizedNodeArray&	nodeArray = lsMemPtr->getOptimizedBvh()->getQuantizedNodeArray();
 	//spu_printf("SPU: numNodes = %d\n",nodeArray.size());
 
-	BvhSubtreeInfoArray& subTrees = lsMemPtr->gOptimizedBvh.getSubtreeInfoArray();
+	BvhSubtreeInfoArray& subTrees = lsMemPtr->getOptimizedBvh()->getSubtreeInfoArray();
 
 	spuNodeCallback	nodeCallback(wuInput,lsMemPtr,spuContacts);
 	IndexedMeshArray&	indexArray = lsMemPtr->gTriangleMeshInterface.getIndexedMeshArray();
@@ -454,7 +535,7 @@ void	ProcessConvexConcaveSpuCollision(SpuCollisionPairInput* wuInput, CollisionT
 			{
 				const btBvhSubtreeInfo& subtree = lsMemPtr->gSubtreeHeaders[j];
 
-				bool overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+				unsigned int overlap = spuTestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
 				if (overlap)
 				{
 					btAssert(subtree.m_subtreeSize);
@@ -674,7 +755,7 @@ void	ProcessSpuConvexConvexCollision(SpuCollisionPairInput* wuInput, CollisionTa
 		uint64_t manifoldAddress = (uint64_t)manifold;
 		btPersistentManifold* spuManifold=&lsMemPtr->gPersistentManifold;
 		//spuContacts.setContactInfo(spuManifold,manifoldAddress,wuInput->m_worldTransform0,wuInput->m_worldTransform1,wuInput->m_isSwapped);
-		spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMemPtr->gColObj0.getWorldTransform(),lsMemPtr->gColObj1.getWorldTransform(),wuInput->m_isSwapped);
+		spuContacts.setContactInfo(spuManifold,manifoldAddress,lsMemPtr->getColObj0()->getWorldTransform(),lsMemPtr->getColObj1()->getWorldTransform(),wuInput->m_isSwapped);
 
 		SpuGjkPairDetector gjk(shape0Ptr,shape1Ptr,shapeType0,shapeType1,marginA,marginB,&vsSolver,&penetrationSolver);
 		gjk.getClosestPoints(cpInput,spuContacts);//,debugDraw);
@@ -707,8 +788,8 @@ void	dmaAndSetupCollisionObjects(SpuCollisionPairInput& collisionPairInput, Coll
 
 	cellDmaWaitTagStatusAll(DMA_MASK(1) | DMA_MASK(2));
 
-	collisionPairInput.m_worldTransform0 = lsMem.gColObj0.getWorldTransform();
-	collisionPairInput.m_worldTransform1 = lsMem.gColObj1.getWorldTransform();
+	collisionPairInput.m_worldTransform0 = lsMem.getColObj0()->getWorldTransform();
+	collisionPairInput.m_worldTransform1 = lsMem.getColObj1()->getWorldTransform();
 
 
 
@@ -1174,8 +1255,8 @@ void	processCollisionTask(void* userPtr, void* lsMemPtr)
 								dmaAndSetupCollisionObjects(collisionPairInput, lsMem);
 
 								handleCollisionPair(collisionPairInput, lsMem, spuContacts, 
-									(uint64_t)lsMem.gColObj0.getCollisionShape(), lsMem.gCollisionShape0,
-									(uint64_t)lsMem.gColObj1.getCollisionShape(), lsMem.gCollisionShape1);
+									(uint64_t)lsMem.getColObj0()->getCollisionShape(), lsMem.gCollisionShape0,
+									(uint64_t)lsMem.getColObj1()->getCollisionShape(), lsMem.gCollisionShape1);
 							}		
 						}
 
diff --git a/src/BulletCollision/CollisionDispatch/btCollisionObject.h b/src/BulletCollision/CollisionDispatch/btCollisionObject.h
index 3b7b04d16..7d50d1d60 100644
--- a/src/BulletCollision/CollisionDispatch/btCollisionObject.h
+++ b/src/BulletCollision/CollisionDispatch/btCollisionObject.h
@@ -133,12 +133,12 @@ public:
 		m_collisionShape = collisionShape;
 	}
 
-	const btCollisionShape*	getCollisionShape() const
+	inline const btCollisionShape*	getCollisionShape() const	///< read-only access: returns the stored m_collisionShape pointer
 	{
 		return m_collisionShape;
 	}
 
-	btCollisionShape*	getCollisionShape()
+	inline btCollisionShape*	getCollisionShape()	///< mutable access: returns the stored m_collisionShape pointer
 	{
 		return m_collisionShape;
 	}
diff --git a/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp b/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
index 5b1f09f62..7065d4e11 100644
--- a/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
+++ b/src/BulletCollision/CollisionShapes/btOptimizedBvh.cpp
@@ -834,20 +834,7 @@ void	btOptimizedBvh::reportSphereOverlappingNodex(btNodeOverlapCallback* nodeCal
 }
 
 
-void btOptimizedBvh::quantizeWithClamp(unsigned short* out, const btVector3& point) const
-{
 
-	btAssert(m_useQuantization);
-
-	btVector3 clampedPoint(point);
-	clampedPoint.setMax(m_bvhAabbMin);
-	clampedPoint.setMin(m_bvhAabbMax);
-
-	btVector3 v = (clampedPoint - m_bvhAabbMin) * m_bvhQuantization;
-	out[0] = (unsigned short)(v.getX()+0.5f);
-	out[1] = (unsigned short)(v.getY()+0.5f);
-	out[2] = (unsigned short)(v.getZ()+0.5f);		
-}
 
 btVector3	btOptimizedBvh::unQuantize(const unsigned short* vecIn) const
 {
diff --git a/src/BulletCollision/CollisionShapes/btOptimizedBvh.h b/src/BulletCollision/CollisionShapes/btOptimizedBvh.h
index b4b0cdb97..d66f686d8 100644
--- a/src/BulletCollision/CollisionShapes/btOptimizedBvh.h
+++ b/src/BulletCollision/CollisionShapes/btOptimizedBvh.h
@@ -307,8 +307,22 @@ public:
 	void	reportAabbOverlappingNodex(btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const;
 
 	void	reportSphereOverlappingNodex(btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const;
+	
+	inline void quantizeWithClamp(unsigned short* out, const btVector3& point) const
+	{
+		///Writes the quantized form of 'point' into out[0..2] after clamping it between m_bvhAabbMin and m_bvhAabbMax.
+		btAssert(m_useQuantization);
+		//clamp: setMax against the lower bound first, then setMin against the upper bound
+		btVector3 insideBounds(point);
+		insideBounds.setMax(m_bvhAabbMin);
+		insideBounds.setMin(m_bvhAabbMax);
+		//offset from the lower bound times m_bvhQuantization; adding 0.5f before the truncating cast rounds non-negative values to nearest
+		btVector3 gridPos = (insideBounds - m_bvhAabbMin) * m_bvhQuantization;
+		out[0] = (unsigned short)(gridPos.getX()+0.5f);
+		out[1] = (unsigned short)(gridPos.getY()+0.5f);
+		out[2] = (unsigned short)(gridPos.getZ()+0.5f);
+	}
 
-	void quantizeWithClamp(unsigned short* out, const btVector3& point) const;
 	
 	btVector3	unQuantize(const unsigned short* vecIn) const;
 
@@ -325,12 +339,12 @@ public:
 	void	updateBvhNodes(btStridingMeshInterface* meshInterface,int firstNode,int endNode,int index);
 
 
-	QuantizedNodeArray&	getQuantizedNodeArray()
+	inline QuantizedNodeArray&	getQuantizedNodeArray()	///< returns a mutable reference to m_quantizedContiguousNodes
 	{	
 		return	m_quantizedContiguousNodes;
 	}
 
-	BvhSubtreeInfoArray&	getSubtreeInfoArray()
+	inline BvhSubtreeInfoArray&	getSubtreeInfoArray()	///< returns a mutable reference to m_SubtreeHeaders
 	{
 		return m_SubtreeHeaders;
 	}