experiment with first-level batching using object index instead of spatial hash in uniform grid (to avoid tuning average object size for uniform grid)

2013-05-03 01:14:34 -07:00
parent 6ee9eb9bb5
commit 1185de51d5
7 changed files with 211 additions and 15 deletions
--- a/Demos3/BasicGpuDemo/BasicGpuDemo.cpp
+++ b/Demos3/BasicGpuDemo/BasicGpuDemo.cpp
@@ -15,9 +15,9 @@ subject to the following restrictions:


 ///create 125 (5x5x5) dynamic object
-#define ARRAY_SIZE_X 5
-#define ARRAY_SIZE_Y 5
-#define ARRAY_SIZE_Z 5
+#define ARRAY_SIZE_X 30
+#define ARRAY_SIZE_Y 20
+#define ARRAY_SIZE_Z 30

 //maximum number of objects (and allow user to shoot additional boxes)
 #define MAX_PROXIES (ARRAY_SIZE_X*ARRAY_SIZE_Y*ARRAY_SIZE_Z + 1024)
@@ -164,7 +164,7 @@ void BasicGpuDemo::exitCL()
 BasicGpuDemo::BasicGpuDemo()
 {
 	m_clData = new btInternalData;
-	setCameraDistance(btScalar(SCALING*20.));
+	setCameraDistance(btScalar(SCALING*120.));
 	this->setAzi(45);
 	this->setEle(45);

@@ -222,7 +222,7 @@ void	BasicGpuDemo::initPhysics()
 	m_dynamicsWorld->setGravity(btVector3(0,-10,0));

 	///create a few basic rigid bodies
-	btBoxShape* groundShape = new btBoxShape(btVector3(btScalar(50.),btScalar(50.),btScalar(50.)));
+	btBoxShape* groundShape = new btBoxShape(btVector3(btScalar(150.),btScalar(50.),btScalar(150.)));
 	//groundShape->initializePolyhedralFeatures();
 //	btCollisionShape* groundShape = new btStaticPlaneShape(btVector3(0,1,0),50);
 	
--- a/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp
+++ b/Demos3/GpuDemos/rigidbody/GpuRigidBodyDemo.cpp
@@ -119,6 +119,7 @@ void	GpuRigidBodyDemo::initPhysics(const ConstructionInfo& ci)

 		setupScene(ci);

+		m_data->m_rigidBodyPipeline->writeAllInstancesToGpu();
 		np->writeAllBodiesToGpu();
 		bp->writeAabbsToGpu();

--- a/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp
@@ -424,7 +424,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
        float dt=1./60.;
        b3ConstraintCfg csCfg( dt );
        csCfg.m_enableParallelSolve = true;
-        csCfg.m_averageExtent = .2f;//@TODO m_averageObjExtent;
+        csCfg.m_averageExtent = 0.3;//0.1;//2;//.2f;//@TODO m_averageObjExtent;
        csCfg.m_staticIdx = static0Index;
        
        
@@ -516,8 +516,8 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
                        if (gpuRadixSort)
                        {	//	3. sort by cell idx
                            B3_PROFILE("gpuRadixSort");
-                            int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
-                            int sortBit = 32;
+                            //int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
+                            //int sortBit = 32;
                            //if( n <= 0xffff ) sortBit = 16;
                            //if( n <= 0xff ) sortBit = 8;
                            //adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
@@ -581,6 +581,12 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
                
                clFinish(m_data->m_queue);
                
+//				{
+//				b3AlignedObjectArray<unsigned int> histogram;
+//				m_data->m_solverGPU->m_numConstraints->copyToHost(histogram);
+//				printf(",,,\n");
+//				}
+							
 				
 				if (nContacts)
 				{
--- a/src/Bullet3OpenCL/RigidBody/b3Solver.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3Solver.cpp
@@ -551,6 +551,75 @@ void b3Solver::solveContactConstraintHost(  b3OpenCLArray<b3RigidBodyCL>* bodyBu
 	
 }

+void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyCL>* bodyBuf,	// not referenced in the body
+					const b3OpenCLArray<b3InertiaCL>* shapeBuf,	// not referenced in the body
+					b3OpenCLArray<b3GpuConstraint4>* constraint, // copied to the host below (cpuConstraints)
+					b3OpenCLArray<unsigned int>* m_numConstraints,	// per-cell constraint counts (read via gN)
+					b3OpenCLArray<unsigned int>* m_offsets,	// per-cell start index into the constraint array (read via gOffsets)
+					int batchId	// batch whose cells are checked
+					)
+{
+	// Debug helper: copies the per-cell counts, per-cell offsets and the constraint array to the
+	// host, walks every cell that belongs to batch 'batchId', and prints "error?" when a constraint
+	// references a body that an earlier cell of the same batch already referenced. Output is printf only.
+	const int nn = b3SolverBase::N_SPLIT*b3SolverBase::N_SPLIT;
+	int numWorkItems = 64*nn/b3SolverBase::N_BATCHES;	// not used below
+	// Host-side snapshots: gN[cell] is the constraint count, gOffsets[cell] the first constraint index.
+	b3AlignedObjectArray<unsigned int> gN;
+	m_numConstraints->copyToHost(gN);
+	b3AlignedObjectArray<unsigned int> gOffsets;
+	m_offsets->copyToHost(gOffsets);
+	int nSplit = b3SolverBase::N_SPLIT;
+	int bIdx = batchId;
+
+	b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
+	constraint->copyToHost(cpuConstraints);
+
+	printf("batch = %d\n", batchId);
+
+	int numWorkgroups = nn/b3SolverBase::N_BATCHES;
+	b3AlignedObjectArray<int> usedBodies;	// body indices recorded from cells visited so far
+	// Each wgIdx maps to one cell of the nSplit x nSplit grid: wgIdx picks an even (x,y) base,
+	// then (bIdx&1) is added to x and (bIdx>>1) to y, and the cell index is x + y*nSplit.
+	for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
+	{
+		printf("wgIdx = %d           ", wgIdx);
+		int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);
+		int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);
+		int cellIdx = xIdx+yIdx*nSplit;
+		printf("cellIdx=%d\n",cellIdx);
+		if( gN[cellIdx] == 0 ) // empty cell: nothing to check or record
+			continue;
+		// NOTE(review): no static-body filtering is visible here; confirm whether shared static bodies trip the check.
+		const int start = gOffsets[cellIdx];
+		const int end = start + gN[cellIdx];
+		// Pass 1: compare this cell's constraints against bodies recorded by previously visited cells.
+		for (int c=start;c<end;c++)
+		{
+			b3GpuConstraint4& constraint = cpuConstraints[c];	// local name shadows the 'constraint' parameter
+			//printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
+			if (usedBodies.findLinearSearch(constraint.m_bodyA)< usedBodies.size())	// presumably size() means "not found" - confirm
+			{
+				printf("error?\n");
+			}
+			if (usedBodies.findLinearSearch(constraint.m_bodyB)< usedBodies.size())
+			{
+				printf("error?\n");
+			}
+		}
+		// Pass 2: record this cell's bodies only after the check, so reuse within one cell is not reported.
+		for (int c=start;c<end;c++)
+		{
+			b3GpuConstraint4& constraint = cpuConstraints[c];
+			usedBodies.push_back(constraint.m_bodyA);
+			usedBodies.push_back(constraint.m_bodyB);
+		}
+
+	}
+}
+
+static bool verify=false;
+
 void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyCL>* bodyBuf, const b3OpenCLArray<b3InertiaCL>* shapeBuf, 
 			b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches)
 {
@@ -580,6 +649,12 @@ void b3Solver::solveContactConstraint(  const b3OpenCLArray<b3RigidBodyCL>* body
 			{
 				for(int ib=0; ib<N_BATCHES; ib++)
 				{
+					
+					if (verify)
+					{
+						checkConstraintBatch(bodyBuf,shapeBuf,constraint,m_numConstraints,m_offsets,ib);
+					}
+
 #ifdef DEBUG_ME
 					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
 					gpuDebugInfo.write(debugInfo,numWorkItems);
--- a/src/Bullet3OpenCL/RigidBody/b3Solver.h
+++ b/src/Bullet3OpenCL/RigidBody/b3Solver.h
@@ -54,7 +54,7 @@ class b3SolverBase
 		enum
 		{
 			N_SPLIT = 16,
-			N_BATCHES = 4,
+			N_BATCHES = 4,//8,//4,
 			N_OBJ_PER_SPLIT = 10,
 			N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT,
 		};
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
@@ -449,6 +449,34 @@ typedef struct
 	int m_nSplit;
 } ConstBufferSSD;

+
+static const int gridTable4x4[] = 
+{
+    0,1,17,16,
+	1,2,18,19,
+	17,18,32,3,
+	16,19,3,34
+};
+
+static const int gridTable8x8[] = 
+{
+	  0,  2,  3, 16, 17, 18, 19,  1,
+	 66, 64, 80, 67, 82, 81, 65, 83,
+	131,144,128,130,147,129,145,146,
+	208,195,194,192,193,211,210,209,
+	 21, 22, 23,  5,  4,  6,  7, 20,
+	 86, 85, 69, 87, 70, 68, 84, 71,
+	151,133,149,150,135,148,132,134,
+	197,27,214,213,212,199,198,196
+	
+};
+
+
+
+
+#define USE_SPATIAL_BATCHING 1
+#define USE_4x4_GRID 1
+
 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
 void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, 
@@ -460,18 +488,47 @@ int nContacts,float scale,int N_SPLIT, int staticIdx)
 	if( gIdx < nContacts )
 	{
 		int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;
+		int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;

 		int aIdx = abs(aPtrAndSignBit );
-		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+		int bIdx = abs(bPtrAndSignBit);

 		bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);
+		bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);

+#if USE_SPATIAL_BATCHING		
 		int idx = (aStatic)? bIdx: aIdx;
 		float4 p = gBodies[idx].m_pos;
 		int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);
 		int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);
+		int newIndex = (xIdx+zIdx*N_SPLIT);
 		
-		gSortDataOut[gIdx].x = (xIdx+zIdx*N_SPLIT);
+#else//USE_SPATIAL_BATCHING
+	#if USE_4x4_GRID
+		int aa = aIdx&3;
+		int bb = bIdx&3;
+		if (aStatic)
+			aa = bb;
+		if (bStatic)
+			bb = aa;
+
+		int gridIndex = aa + bb*4;
+		int newIndex = gridTable4x4[gridIndex];
+	#else//USE_4x4_GRID
+		int aa = aIdx&7;
+		int bb = bIdx&7;
+		if (aStatic)
+			aa = bb;
+		if (bStatic)
+			bb = aa;
+
+		int gridIndex = aa + bb*8;
+		int newIndex = gridTable8x8[gridIndex];
+	#endif//USE_4x4_GRID
+#endif//USE_SPATIAL_BATCHING
+
+
+		gSortDataOut[gIdx].x = newIndex;
 		gSortDataOut[gIdx].y = gIdx;
 	}
 	else
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
@@ -451,6 +451,34 @@ static const char* solverSetup2CL= \
 "	int m_nSplit;\n"
 "} ConstBufferSSD;\n"
 "\n"
+"\n"
+"static const int gridTable4x4[] = \n"
+"{\n"
+"    0,1,17,16,\n"
+"	1,2,18,19,\n"
+"	17,18,32,3,\n"
+"	16,19,3,34\n"
+"};\n"
+"\n"
+"static const int gridTable8x8[] = \n"
+"{\n"
+"	  0,  2,  3, 16, 17, 18, 19,  1,\n"
+"	 66, 64, 80, 67, 82, 81, 65, 83,\n"
+"	131,144,128,130,147,129,145,146,\n"
+"	208,195,194,192,193,211,210,209,\n"
+"	 21, 22, 23,  5,  4,  6,  7, 20,\n"
+"	 86, 85, 69, 87, 70, 68, 84, 71,\n"
+"	151,133,149,150,135,148,132,134,\n"
+"	197,27,214,213,212,199,198,196\n"
+"	\n"
+"};\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"#define USE_SPATIAL_BATCHING 1\n"
+"#define USE_4x4_GRID 1\n"
+"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
@@ -462,18 +490,47 @@ static const char* solverSetup2CL= \
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;\n"
+"		int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;\n"
 "\n"
 "		int aIdx = abs(aPtrAndSignBit );\n"
-"		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
+"		int bIdx = abs(bPtrAndSignBit);\n"
 "\n"
 "		bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n"
+"		bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n"
 "\n"
+"#if USE_SPATIAL_BATCHING		\n"
 "		int idx = (aStatic)? bIdx: aIdx;\n"
 "		float4 p = gBodies[idx].m_pos;\n"
 "		int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n"
 "		int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*scale) & (N_SPLIT-1);\n"
+"		int newIndex = (xIdx+zIdx*N_SPLIT);\n"
 "		\n"
-"		gSortDataOut[gIdx].x = (xIdx+zIdx*N_SPLIT);\n"
+"#else//USE_SPATIAL_BATCHING\n"
+"	#if USE_4x4_GRID\n"
+"		int aa = aIdx&3;\n"
+"		int bb = bIdx&3;\n"
+"		if (aStatic)\n"
+"			aa = bb;\n"
+"		if (bStatic)\n"
+"			bb = aa;\n"
+"\n"
+"		int gridIndex = aa + bb*4;\n"
+"		int newIndex = gridTable4x4[gridIndex];\n"
+"	#else//USE_4x4_GRID\n"
+"		int aa = aIdx&7;\n"
+"		int bb = bIdx&7;\n"
+"		if (aStatic)\n"
+"			aa = bb;\n"
+"		if (bStatic)\n"
+"			bb = aa;\n"
+"\n"
+"		int gridIndex = aa + bb*8;\n"
+"		int newIndex = gridTable8x8[gridIndex];\n"
+"	#endif//USE_4x4_GRID\n"
+"#endif//USE_SPATIAL_BATCHING\n"
+"\n"
+"\n"
+"		gSortDataOut[gIdx].x = newIndex;\n"
 "		gSortDataOut[gIdx].y = gIdx;\n"
 "	}\n"
 "	else\n"