From b8d5cecfe322d77433c3ddc4146f00ed620fbfc8 Mon Sep 17 00:00:00 2001
From: erwin coumans <erwin.coumans@gmail.com>
Date: Sat, 6 Jul 2013 14:11:32 -0700
Subject: [PATCH] joint (non-contact constraint) solver iterations are now
 working on GPU, but overall slower because of data copy. Will move joint
 setup to GPU, and then some benefit should be visible. Don't use 64
 alignment; it causes a data structure size mismatch between CPU and GPU

---
 .../ConstraintSolver/b3SolverBody.h           |   5 +-
 .../RigidBody/b3GpuPgsJacobiSolver.cpp        | 102 ++++++++++++++----
 .../RigidBody/b3GpuRigidBodyPipeline.cpp      |   4 +-
 .../RigidBody/kernels/jointSolver.cl          |  10 +-
 .../RigidBody/kernels/jointSolver.h           |  10 +-
 5 files changed, 96 insertions(+), 35 deletions(-)
diff --git a/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h b/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h
index b25c6c085..8ec5ec7aa 100644
--- a/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h
+++ b/src/Bullet3Dynamics/ConstraintSolver/b3SolverBody.h
@@ -51,6 +51,7 @@ struct	b3SimdScalar
 	{
 		__m128		m_vec128;
 		float		m_floats[4];
+		float		x,y,z,w;
 		int			m_ints[4];
 		b3Scalar	m_unusedPadding;
 	};
@@ -105,7 +106,7 @@ operator+(const b3SimdScalar& v1, const b3SimdScalar& v2)
 #endif
 
 ///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
-B3_ATTRIBUTE_ALIGNED64 (struct)	b3SolverBody
+B3_ATTRIBUTE_ALIGNED16 (struct)	b3SolverBody
 {
 	B3_DECLARE_ALIGNED_ALLOCATOR();
 	b3Transform		m_worldTransform;
@@ -125,6 +126,8 @@ B3_ATTRIBUTE_ALIGNED64 (struct)	b3SolverBody
 		int		m_originalBodyIndex;
 	};
 
+	int padding[3];
+
 
 	void	setWorldTransform(const b3Transform& worldTransform)
 	{
diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp
index f964a61ae..76a759f70 100644
--- a/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuPgsJacobiSolver.cpp
@@ -25,6 +25,10 @@ struct b3GpuPgsJacobiSolverInternalData
 	cl_device_id m_device;
 	cl_command_queue m_queue;
 	cl_kernel m_solveJointConstraintRowsKernels;
+	b3OpenCLArray<b3SolverConstraint>*		m_gpuSolverConstraintRows;
+	b3OpenCLArray<b3SolverBody>*			m_gpuSolverBodies;
+	b3OpenCLArray<b3BatchConstraint>*		m_gpuBatchConstraints;
+	b3OpenCLArray<b3SolverConstraint>*		m_gpuConstraintRows;
 
 
 };
@@ -36,6 +40,12 @@ b3GpuPgsJacobiSolver::b3GpuPgsJacobiSolver (cl_context ctx, cl_device_id device,
 	m_gpuData->m_device = device;
 	m_gpuData->m_queue = queue;
 
+	m_gpuData->m_gpuSolverConstraintRows = new b3OpenCLArray<b3SolverConstraint>(ctx,queue);
+	m_gpuData->m_gpuSolverBodies = new b3OpenCLArray<b3SolverBody>(m_gpuData->m_context,m_gpuData->m_queue);
+	m_gpuData->m_gpuBatchConstraints = new b3OpenCLArray<b3BatchConstraint>(m_gpuData->m_context,m_gpuData->m_queue);
+	m_gpuData->m_gpuConstraintRows = new b3OpenCLArray<b3SolverConstraint>(m_gpuData->m_context,m_gpuData->m_queue);
+
+
 	cl_int errNum=0;
 
 	{
@@ -53,6 +63,11 @@ b3GpuPgsJacobiSolver::~b3GpuPgsJacobiSolver ()
 {
 	clReleaseKernel(m_gpuData->m_solveJointConstraintRowsKernels);
 
+	delete m_gpuData->m_gpuSolverConstraintRows;
+	delete m_gpuData->m_gpuSolverBodies;
+	delete m_gpuData->m_gpuBatchConstraints;
+	delete m_gpuData->m_gpuConstraintRows;
+
 	delete m_gpuData;
 }
 
@@ -419,16 +434,51 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlySetup(b3RigidBodyCL* bodie
 
 
 
+///A straight copy of the GPU/OpenCL kernel helper, kept on the CPU for debugging: accumulates a scaled impulse into the body's delta linear and angular velocities.
+__inline void internalApplyImpulse( b3SolverBody* body,  const b3Vector3& linearComponent, const b3Vector3& angularComponent,float impulseMagnitude)
+{
+	body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor; // linear part, additionally scaled by the body's m_linearFactor
+	body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor); // angular part, additionally scaled by the body's m_angularFactor
+}
 
 
+void resolveSingleConstraintRowGeneric2( b3SolverBody* body1,  b3SolverBody* body2,  b3SolverConstraint* c) // one solver step for row c: updates c->m_appliedImpulse.x and both bodies' delta velocities (NOTE(review): appears to mirror the OpenCL resolveSingleConstraintRowGeneric - confirm)
+{
+	float deltaImpulse = c->m_rhs-c->m_appliedImpulse.x*c->m_cfm; // start from rhs minus the cfm-weighted impulse applied so far
+	float deltaVel1Dotn	=	b3Dot(c->m_contactNormal,body1->m_deltaLinearVelocity) 	+ b3Dot(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity); // body1's delta velocity projected onto this row
+	float deltaVel2Dotn	=	-b3Dot(c->m_contactNormal,body2->m_deltaLinearVelocity) + b3Dot(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity); // same for body2; the normal term is negated
+
+	deltaImpulse	-=	deltaVel1Dotn*c->m_jacDiagABInv; // remove body1's contribution, scaled by m_jacDiagABInv
+	deltaImpulse	-=	deltaVel2Dotn*c->m_jacDiagABInv; // remove body2's contribution, scaled by m_jacDiagABInv
+
+	float sum = c->m_appliedImpulse.x + deltaImpulse; // candidate accumulated impulse before clamping
+	if (sum < c->m_lowerLimit) // below the lower limit: clamp
+	{
+		deltaImpulse = c->m_lowerLimit-c->m_appliedImpulse.x; // apply only what is needed to reach the lower limit
+		c->m_appliedImpulse.x = c->m_lowerLimit;
+	}
+	else if (sum > c->m_upperLimit) // above the upper limit: clamp
+	{
+		deltaImpulse = c->m_upperLimit-c->m_appliedImpulse.x; // apply only what is needed to reach the upper limit
+		c->m_appliedImpulse.x = c->m_upperLimit;
+	}
+	else // within limits: accept the candidate as the new accumulated impulse
+	{
+		c->m_appliedImpulse.x = sum;
+	}
+
+	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse); // applied unconditionally (no check for a zero m_invMass)
+	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse); // body2 receives the negated normal; also applied unconditionally
+
+}
 
 
 
 b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstraint** cpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal)
 {
-	bool useCpu = false;
+	bool useb3PgsJacobiSolver = false;
 	bool createBatches = batches.size()==0;
-	if (useCpu)
+	if (useb3PgsJacobiSolver)
 	{
 		return b3PgsJacobiSolver::solveGroupCacheFriendlyIterations(cpuConstraints,numConstraints,infoGlobal);
 	} else
@@ -449,20 +499,19 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai
 		}
 		int maxIterations = infoGlobal.m_numIterations;
 		bool useBatching = true;
+		bool useGpu=false;
+
 		if (useBatching )
 		{
-			b3OpenCLArray<b3SolverConstraint> gpuSolverConstraintRows(m_gpuData->m_context,m_gpuData->m_queue);
-			gpuSolverConstraintRows.copyFromHost(m_tmpSolverNonContactConstraintPool);
-
-			b3OpenCLArray<b3SolverBody> gpuSolverBodies(m_gpuData->m_context,m_gpuData->m_queue);
-			gpuSolverBodies.copyFromHost(m_tmpSolverBodyPool);
-//			gpuSolverBodies.copyToHost(m_tmpSolverBodyPool);
+			if (useGpu)
+			{
+				B3_PROFILE("copy from host");
+				m_gpuData->m_gpuSolverConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool);
+				m_gpuData->m_gpuSolverBodies->copyFromHost(m_tmpSolverBodyPool);
+				m_gpuData->m_gpuBatchConstraints->copyFromHost(batchConstraints);
+				m_gpuData->m_gpuConstraintRows->copyFromHost(m_tmpSolverNonContactConstraintPool);
+			}
 			
-			b3OpenCLArray<b3BatchConstraint> gpuBatchConstraints(m_gpuData->m_context,m_gpuData->m_queue);
-			gpuBatchConstraints.copyFromHost(batchConstraints);
-			
-			b3OpenCLArray<b3SolverConstraint> gpuConstraintRows(m_gpuData->m_context,m_gpuData->m_queue);
-			gpuConstraintRows.copyFromHost(m_tmpSolverNonContactConstraintPool);
 			
 
 			for ( int iteration = 0 ; iteration< maxIterations ; iteration++)
@@ -475,18 +524,20 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai
 				{
 					int numConstraintsInBatch = batches[bb];
 
-					bool useGpu=false;
+					
 					if (useGpu)
 					{
+						B3_PROFILE("b3LauncherCL");
 						b3LauncherCL launcher(m_gpuData->m_queue,m_gpuData->m_solveJointConstraintRowsKernels);
-						launcher.setBuffer(gpuSolverBodies.getBufferCL());
-						launcher.setBuffer(gpuBatchConstraints.getBufferCL());
-						launcher.setBuffer(gpuConstraintRows.getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuSolverBodies->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuBatchConstraints->getBufferCL());
+						launcher.setBuffer(m_gpuData->m_gpuConstraintRows->getBufferCL());
 						launcher.setConst(batchOffset);
 						launcher.setConst(constraintOffset);
 						launcher.setConst(numConstraintsInBatch);
 
 						launcher.launch1D(numConstraintsInBatch);
+						clFinish(m_gpuData->m_queue);
 
 					} else
 					{
@@ -503,8 +554,8 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai
 							{
 //							
 								b3SolverConstraint& constraint = m_tmpSolverNonContactConstraintPool[c.m_constraintRowOffset+jj];
-//								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
-								resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+								resolveSingleConstraintRowGenericSIMD(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
+								//resolveSingleConstraintRowGeneric(m_tmpSolverBodyPool[constraint.m_solverBodyIdA],m_tmpSolverBodyPool[constraint.m_solverBodyIdB],constraint);
 
 							}
 						}
@@ -514,9 +565,16 @@ b3Scalar b3GpuPgsJacobiSolver::solveGroupCacheFriendlyIterations(b3TypedConstrai
 				}
 			}//for (int iteration...
 
-			gpuSolverBodies.copyToHost(m_tmpSolverBodyPool);
-			clFinish(m_gpuData->m_queue);
-			printf(",,\n");
+			if (useGpu)
+			{
+				B3_PROFILE("copy to host");
+				m_gpuData->m_gpuSolverBodies->copyToHost(m_tmpSolverBodyPool);
+			}
+			//int sz = sizeof(b3SolverBody);
+			//printf("cpu sizeof(b3SolverBody)=%d\n",sz);
+
+			
+			
 
 
 		} else
diff --git a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
index 0fa1f26ec..38eb0ab3c 100644
--- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
@@ -237,8 +237,8 @@ void	b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
 			m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints);
 
 		}
-		//gpuBodies.copyFromHost(hostBodies);
-		printf("...\n");
+		gpuBodies.copyFromHost(hostBodies);
+		
 	}
 
 	if (numContacts)
diff --git a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl
index 6a197b477..eb3f6040b 100644
--- a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.cl
@@ -1,4 +1,5 @@
 
+
 typedef float4 Quaternion;
 
 typedef struct
@@ -38,6 +39,8 @@ typedef struct
 		void*	m_originalBody;
 		int		m_originalBodyIndex;
 	};
+	int padding[3];
+
 } b3SolverBody;
 
 
@@ -131,11 +134,8 @@ void resolveSingleConstraintRowGeneric(__global b3SolverBody* body1, __global b3
 		c->m_appliedImpulse.x = sum;
 	}
 
-	if (body1->m_invMass.x)
-		internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);
-	
-	if (body2->m_invMass.x)
-		internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);
+	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);
+	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);
 
 }
 
diff --git a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
index 1a25292f8..d43d2979b 100644
--- a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
@@ -1,6 +1,7 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* solveConstraintRowsCL= \
 "\n"
+"\n"
 "typedef float4 Quaternion;\n"
 "\n"
 "typedef struct\n"
@@ -40,6 +41,8 @@ static const char* solveConstraintRowsCL= \
 "		void*	m_originalBody;\n"
 "		int		m_originalBodyIndex;\n"
 "	};\n"
+"	int padding[3];\n"
+"\n"
 "} b3SolverBody;\n"
 "\n"
 "\n"
@@ -133,11 +136,8 @@ static const char* solveConstraintRowsCL= \
 "		c->m_appliedImpulse.x = sum;\n"
 "	}\n"
 "\n"
-"	if (body1->m_invMass.x)\n"
-"		internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n"
-"	\n"
-"	if (body2->m_invMass.x)\n"
-"		internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n"
+"	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n"
+"	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n"
 "\n"
 "}\n"
 "\n"