share more data structures and code between OpenCL and C/C++ on CPU:

move the setConstraint4/b3ConvertConstraint4 to shared code.
2013-11-07 12:46:01 -08:00
parent a9a758dd54
commit e20cb22832
16 changed files with 1249 additions and 908 deletions
--- a/src/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h
+++ b/src/Bullet3Collision/NarrowPhaseCollision/b3RigidBodyCL.h
@@ -34,11 +34,9 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3RigidBodyCL : public b3RigidBodyData
 };


-struct b3InertiaCL
+struct b3InertiaCL : public b3InertiaData
 {
-	b3Matrix3x3 m_invInertiaWorld;
-	b3Matrix3x3 m_initInvInertia;
+
 };

-
 #endif//B3_RIGID_BODY_CL
--- a/src/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h
+++ b/src/Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h
@@ -3,6 +3,7 @@

 #include "Bullet3Common/shared/b3Float4.h"
 #include "Bullet3Common/shared/b3Quat.h"
+#include "Bullet3Common/shared/b3Mat3x3.h"

 typedef struct b3RigidBodyData b3RigidBodyData_t;

@@ -19,7 +20,15 @@ struct b3RigidBodyData
 	float 				m_restituitionCoeff;
 	float 				m_frictionCoeff;
 };
-	
+
+typedef struct b3InertiaData b3InertiaData_t;
+
+struct b3InertiaData
+{
+	b3Mat3x3 m_invInertiaWorld;
+	b3Mat3x3 m_initInvInertia;
+};
+

 #endif //B3_RIGIDBODY_DATA_H
 	
--- a/src/Bullet3Common/shared/b3Mat3x3.h
+++ b/src/Bullet3Common/shared/b3Mat3x3.h
@@ -24,11 +24,18 @@ inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg mat)

 #define b3GetRow(m,row) m.getRow(row)

+__inline
+b3Float4 mtMul3(b3Float4ConstArg a, b3Mat3x3ConstArg b)
+{
+	return b*a;
+}
+
+
 #else

 typedef struct
 {
-	float4 m_row[3];
+	b3Float4 m_row[3];
 }b3Mat3x3;

 #define b3Mat3x3ConstArg const b3Mat3x3
@@ -36,7 +43,7 @@ typedef struct

 inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)
 {
-	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
+	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
 	b3Mat3x3 out;

 	out.m_row[0].x=1-2*quat2.y-2*quat2.z;
@@ -66,10 +73,107 @@ inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)
 	return out;
 }

+
+__inline
+b3Mat3x3 mtZero();
+
+__inline
+b3Mat3x3 mtIdentity();
+
+__inline
+b3Mat3x3 mtTranspose(b3Mat3x3 m);
+
+__inline
+b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);
+
+__inline
+b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);
+
+__inline
+b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);
+
+__inline
+b3Mat3x3 mtZero()
+{
+	b3Mat3x3 m;
+	m.m_row[0] = (b3Float4)(0.f);
+	m.m_row[1] = (b3Float4)(0.f);
+	m.m_row[2] = (b3Float4)(0.f);
+	return m;
+}
+
+__inline
+b3Mat3x3 mtIdentity()
+{
+	b3Mat3x3 m;
+	m.m_row[0] = (b3Float4)(1,0,0,0);
+	m.m_row[1] = (b3Float4)(0,1,0,0);
+	m.m_row[2] = (b3Float4)(0,0,1,0);
+	return m;
+}
+
+__inline
+b3Mat3x3 mtTranspose(b3Mat3x3 m)
+{
+	b3Mat3x3 out;
+	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
+	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
+	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
+	return out;
+}
+
+__inline
+b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)
+{
+	b3Mat3x3 transB;
+	transB = mtTranspose( b );
+	b3Mat3x3 ans;
+	//	why this doesn't run when 0ing in the for{}
+	a.m_row[0].w = 0.f;
+	a.m_row[1].w = 0.f;
+	a.m_row[2].w = 0.f;
+	for(int i=0; i<3; i++)
+	{
+//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);
+		ans.m_row[i].w = 0.f;
+	}
+	return ans;
+}
+
+__inline
+b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)
+{
+	b3Float4 ans;
+	ans.x = b3Dot3F4( a.m_row[0], b );
+	ans.y = b3Dot3F4( a.m_row[1], b );
+	ans.z = b3Dot3F4( a.m_row[2], b );
+	ans.w = 0.f;
+	return ans;
+}
+
+__inline
+b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)
+{
+	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
+	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
+	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
+
+	b3Float4 ans;
+	ans.x = b3Dot3F4( a, colx );
+	ans.y = b3Dot3F4( a, coly );
+	ans.z = b3Dot3F4( a, colz );
+	return ans;
+}
+
+
 #endif





+
 #endif //B3_MAT3x3_H
--- a/src/Bullet3Common/shared/b3Quat.h
+++ b/src/Bullet3Common/shared/b3Quat.h
@@ -46,7 +46,18 @@ inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)

 inline b3Quat b3QuatNormalize(b3QuatConstArg in)
 {
-	return b3FastNormalize4(in);
+	//return b3FastNormalize4(in);
+	float len = native_sqrt(dot(q, q));
+	if(len > 0.f)
+	{
+		q *= 1.f / len;
+	}
+	else
+	{
+		q.x = q.y = q.z = 0.f;
+		q.w = 1.f;
+	}
+	return q;
 }
 inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)
 {
--- a/src/Bullet3Dynamics/shared/b3ContactConstraint4.h
+++ b/src/Bullet3Dynamics/shared/b3ContactConstraint4.h
@@ -5,6 +5,7 @@

 typedef struct b3ContactConstraint4 b3ContactConstraint4_t;

+
 struct b3ContactConstraint4
 {

@@ -25,9 +26,9 @@ struct b3ContactConstraint4
 };

 //inline	void setFrictionCoeff(float value) { m_linear[3] = value; }
-inline	float b3GetFrictionCoeff(b3ContactConstraint4* constraint) 
+inline	float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) 
 {
-	return constraint->m_linear[3]; 
+	return constraint->m_linear.w; 
 }

 #endif //B3_CONTACT_CONSTRAINT5_H
--- a/src/Bullet3Dynamics/shared/b3ConvertConstraint4.h
+++ b/src/Bullet3Dynamics/shared/b3ConvertConstraint4.h
@@ -0,0 +1,153 @@
+
+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
+
+void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);
+ void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)
+{
+  if (b3Fabs(n.z) > 0.70710678f) {
+    // choose p in y-z plane
+    float a = n.y*n.y + n.z*n.z;
+    float k = 1.f/sqrt(a);
+    p[0].x = 0;
+	p[0].y = -n.z*k;
+	p[0].z = n.y*k;
+    // set q = n x p
+    q[0].x = a*k;
+	q[0].y = -n.x*p[0].z;
+	q[0].z = n.x*p[0].y;
+  }
+  else {
+    // choose p in x-y plane
+    float a = n.x*n.x + n.y*n.y;
+    float k = 1.f/sqrt(a);
+    p[0].x = -n.y*k;
+	p[0].y = n.x*k;
+	p[0].z = 0;
+    // set q = n x p
+    q[0].x = -n.z*p[0].y;
+	q[0].y = n.z*p[0].x;
+	q[0].z = a*k;
+  }
+}
+
+
+ 
+void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)
+{
+	*linear = b3MakeFloat4(n.x,n.y,n.z,0.f);
+	*angular0 = b3Cross3(r0, n);
+	*angular1 = -b3Cross3(r1, n);
+}
+
+
+float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,
+	b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )
+{
+	return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);
+}
+
+
+float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,
+					float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)
+{
+	//	linear0,1 are normlized
+	float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;
+	float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);
+	float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;
+	float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);
+	return -1.f/(jmj0+jmj1+jmj2+jmj3);
+}
+
+
+void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,
+	b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, 
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,
+	b3ContactConstraint4_t* dstC )
+{
+	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
+	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
+
+	float dtInv = 1.f/dt;
+	for(int ic=0; ic<4; ic++)
+	{
+		dstC->m_appliedRambdaDt[ic] = 0.f;
+	}
+	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;
+
+
+	dstC->m_linear = src->m_worldNormalOnB;
+	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );
+	for(int ic=0; ic<4; ic++)
+	{
+		b3Float4 r0 = src->m_worldPosB[ic] - posA;
+		b3Float4 r1 = src->m_worldPosB[ic] - posB;
+
+		if( ic >= src->m_worldNormalOnB.w )//npoints
+		{
+			dstC->m_jacCoeffInv[ic] = 0.f;
+			continue;
+		}
+
+		float relVelN;
+		{
+			b3Float4 linear, angular0, angular1;
+			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);
+
+			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB );
+
+			relVelN = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB);
+
+			float e = 0.f;//src->getRestituitionCoeff();
+			if( relVelN*relVelN < 0.004f ) e = 0.f;
+
+			dstC->m_b[ic] = e*relVelN;
+			//float penetration = src->m_worldPosB[ic].w;
+			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;
+			dstC->m_appliedRambdaDt[ic] = 0.f;
+		}
+	}
+
+	if( src->m_worldNormalOnB.w > 0 )//npoints
+	{	//	prepare friction
+		b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);
+		for(int i=0; i<src->m_worldNormalOnB.w; i++) 
+			center += src->m_worldPosB[i];
+		center /= (float)src->m_worldNormalOnB.w;
+
+		b3Float4 tangent[2];
+		b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);
+		
+		b3Float4 r[2];
+		r[0] = center - posA;
+		r[1] = center - posB;
+
+		for(int i=0; i<2; i++)
+		{
+			b3Float4 linear, angular0, angular1;
+			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);
+
+			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, &invInertiaA, invMassB, &invInertiaB );
+			dstC->m_fAppliedRambdaDt[i] = 0.f;
+		}
+		dstC->m_center = center;
+	}
+
+	for(int i=0; i<4; i++)
+	{
+		if( i<src->m_worldNormalOnB.w )
+		{
+			dstC->m_worldPos[i] = src->m_worldPosB[i];
+		}
+		else
+		{
+			dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);
+		}
+	}
+}
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
@@ -224,7 +224,18 @@ static const char* satKernelsCL= \
 "}\n"
 "inline b3Quat b3QuatNormalize(b3QuatConstArg in)\n"
 "{\n"
-"	return b3FastNormalize4(in);\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
 "}\n"
 "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
 "{\n"
@@ -253,13 +264,13 @@ static const char* satKernelsCL= \
 "#else\n"
 "typedef struct\n"
 "{\n"
-"	float4 m_row[3];\n"
+"	b3Float4 m_row[3];\n"
 "}b3Mat3x3;\n"
 "#define b3Mat3x3ConstArg const b3Mat3x3\n"
 "#define b3GetRow(m,row) (m.m_row[row])\n"
 "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
 "{\n"
-"	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
 "	b3Mat3x3 out;\n"
 "	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
 "	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
@@ -283,6 +294,87 @@ static const char* satKernelsCL= \
 "	out.m_row[2] = fabs(matIn.m_row[2]);\n"
 "	return out;\n"
 "}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"}\n"
 "#endif\n"
 "#endif //B3_MAT3x3_H\n"
 "typedef struct b3Aabb b3Aabb_t;\n"
--- a/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp
@@ -1,15 +1,32 @@

-bool b3GpuBatchContacts = true;
-bool b3GpuSolveConstraint = true;
-bool gpuRadixSort=true;
-bool gpuSetSortData = true;
+//#define USE_CPU
+#ifdef USE_CPU
+	bool b3GpuBatchContacts = false;
+	bool b3GpuSolveConstraint = false;
+	bool gpuRadixSort=false;
+	bool gpuSetSortData = false;

-bool optionalSortContactsDeterminism = true;
-bool gpuSortContactsDeterminism = true;
-bool useCpuCopyConstraints = false;
+	bool optionalSortContactsDeterminism = true;
+	bool gpuSortContactsDeterminism = false;
+	bool useCpuCopyConstraints = true;

-bool useScanHost = false;
-bool reorderContactsOnCpu = false;
+	bool useScanHost = true;
+	bool reorderContactsOnCpu = true;
+
+#else
+	bool b3GpuBatchContacts = true;
+	bool b3GpuSolveConstraint = true;
+	bool gpuRadixSort=true;
+	bool gpuSetSortData = true;
+
+	bool optionalSortContactsDeterminism = true;
+	bool gpuSortContactsDeterminism = true;
+	bool useCpuCopyConstraints = false;
+
+	bool useScanHost = false;
+	bool reorderContactsOnCpu = false;
+
+#endif

 #include "b3GpuBatchingPgsSolver.h"
 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
--- a/src/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuConstraint4.h
@@ -3,24 +3,13 @@
 #define B3_CONSTRAINT4_h
 #include "Bullet3Common/b3Vector3.h"

-B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4
+#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
+
+
+B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4 : public b3ContactConstraint4
 {
 	B3_DECLARE_ALIGNED_ALLOCATOR();

-	b3Vector3 m_linear;//normal?
-	b3Vector3 m_worldPos[4];
-	b3Vector3 m_center;	//	friction
-	float m_jacCoeffInv[4];
-	float m_b[4];
-	float m_appliedRambdaDt[4];
-	float m_fJacCoeffInv[2];	//	friction
-	float m_fAppliedRambdaDt[2];	//	friction
-
-	unsigned int m_bodyA;
-	unsigned int m_bodyB;
-	int m_batchIdx;
-	unsigned int m_paddings;
-
 	inline	void setFrictionCoeff(float value) { m_linear[3] = value; }
 	inline	float getFrictionCoeff() const { return m_linear[3]; }
 };
--- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
@@ -33,11 +33,21 @@ subject to the following restrictions:
 #define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl"
 #define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl"

-bool useDbvt = false;//true;
-bool useBullet2CpuSolver = true;
-bool dumpContactStats = false;
-bool calcWorldSpaceAabbOnCpu = false;//true;
-bool useCalculateOverlappingPairsHost = false;
+//#define USE_CPU
+#ifdef USE_CPU
+	bool useDbvt = true;
+	bool useBullet2CpuSolver = true;
+	bool dumpContactStats = false;
+	bool calcWorldSpaceAabbOnCpu = true;
+	bool useCalculateOverlappingPairsHost = true;
+
+#else
+	bool useDbvt = false;//true;
+	bool useBullet2CpuSolver = true;
+	bool dumpContactStats = false;
+	bool calcWorldSpaceAabbOnCpu = false;//true;
+	bool useCalculateOverlappingPairsHost = false;
+#endif

 #ifdef TEST_OTHER_GPU_SOLVER
 #include "b3GpuJacobiSolver.h"
--- a/src/Bullet3OpenCL/RigidBody/b3Solver.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3Solver.cpp
@@ -18,6 +18,7 @@ subject to the following restrictions:

 ///useNewBatchingKernel  is a rewritten kernel using just a single thread of the warp, for experiments
 bool useNewBatchingKernel = false;
+bool convertConstraintOnCpu = false;//true;

 #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
 #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
@@ -26,6 +27,7 @@ bool useNewBatchingKernel = false;
 #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
 #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"

+#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"

 #include "kernels/solverSetup.h"
 #include "kernels/solverSetup2.h"
@@ -203,103 +205,6 @@ b3Solver::~b3Solver()

 

-
-/*void b3Solver::reorderConvertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf, 
-	const b3OpenCLArray<b3InertiaCL>* shapeBuf,
-	b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData, 
-	int nContacts, const b3Solver::ConstraintCfg& cfg )
-{
-	if( m_contactBuffer )
-	{
-		m_contactBuffer->resize(nContacts);
-	}
-	if( m_contactBuffer == 0 )
-	{
-		B3_PROFILE("new m_contactBuffer;");
-		m_contactBuffer = new b3OpenCLArray<b3Contact4>(m_context,m_queue,nContacts );
-		m_contactBuffer->resize(nContacts);
-	}
-	
-
-	
-
-	//DeviceUtils::Config dhCfg;
-	//Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
-	if( cfg.m_enableParallelSolve )
-	{
-		
-
-		clFinish(m_queue);
-		
-		//	contactsIn -> m_contactBuffer
-		{
-			B3_PROFILE("sortContacts");
-			sortContacts( bodyBuf, contactsIn, additionalData, nContacts, cfg );
-			clFinish(m_queue);
-		}
-		
-		
-		{
-			B3_PROFILE("m_copyConstraintKernel");
-
-			
-
-			b3Int4 cdata; cdata.x = nContacts;
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( contactsIn->getBufferCL() ) };
-//			b3LauncherCL launcher( m_queue, data->m_device->getKernel( PATH, "CopyConstraintKernel",  "-I ..\\..\\ -Wf,--c++", 0 ) );
-			b3LauncherCL launcher( m_queue, m_copyConstraintKernel );
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-			launcher.setConst(  cdata );
-			launcher.launch1D( nContacts, 64 );
-			clFinish(m_queue);
-		}
-
-		{
-			B3_PROFILE("batchContacts");
-			b3Solver::batchContacts( contactsIn, nContacts, m_numConstraints, m_offsets, cfg.m_staticIdx );
-
-		}
-	}
-	{
-			B3_PROFILE("waitForCompletion (batchContacts)");
-			clFinish(m_queue);
-	}
-	
-	//================
-	
-	{
-		B3_PROFILE("convertToConstraints");
-		b3Solver::convertToConstraints(  bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg );
-	}
-
-	{
-		B3_PROFILE("convertToConstraints waitForCompletion");
-		clFinish(m_queue);
-	}
-	
-}
-*/
-
-	static
-	inline
-	float calcRelVel(const b3Vector3& l0, const b3Vector3& l1, const b3Vector3& a0, const b3Vector3& a1, 
-					 const b3Vector3& linVel0, const b3Vector3& angVel0, const b3Vector3& linVel1, const b3Vector3& angVel1)
-	{
-		return b3Dot(l0, linVel0) + b3Dot(a0, angVel0) + b3Dot(l1, linVel1) + b3Dot(a1, angVel1);
-	}
-
-
-	static
-	inline
-	void setLinearAndAngular(const b3Vector3& n, const b3Vector3& r0, const b3Vector3& r1,
-							 b3Vector3& linear, b3Vector3& angular0, b3Vector3& angular1)
-	{
-		linear = -n;
-		angular0 = -b3Cross(r0, n);
-		angular1 = b3Cross(r1, n);
-	}
-
-
 template<bool JACOBI>
 static
 __inline
@@ -323,7 +228,7 @@ void solveContact(b3GpuConstraint4& cs,
 			b3Vector3 angular0, angular1, linear;
 			b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
 			b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
-			setLinearAndAngular( (const b3Vector3 &)-cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, linear, angular0, angular1 );
+			setLinearAndAngular( (const b3Vector3 &)cs.m_linear, (const b3Vector3 &)r0, (const b3Vector3 &)r1, &linear, &angular0, &angular1 );

 			float rambdaDt = calcRelVel((const b3Vector3 &)cs.m_linear,(const b3Vector3 &) -cs.m_linear, angular0, angular1,
 				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
@@ -407,7 +312,7 @@ void solveContact(b3GpuConstraint4& cs,
 		b3Vector3 r1 = center - posB;
 		for(int i=0; i<2; i++)
 		{
-			setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
+			setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );
 			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
 				linVelA, angVelA, linVelB, angVelB );
 			rambdaDt *= cs.m_fJacCoeffInv[i];
@@ -1066,28 +971,78 @@ void b3Solver::convertToConstraints( const b3OpenCLArray<b3RigidBodyCL>* bodyBuf
 	};

 	{
-		B3_PROFILE("m_contactToConstraintKernel");
+
 		CB cdata;
 		cdata.m_nContacts = nContacts;
 		cdata.m_dt = cfg.m_dt;
 		cdata.m_positionDrift = cfg.m_positionDrift;
 		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;

-		
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
-			b3BufferInfoCL( contactCOut->getBufferCL() )};
-		b3LauncherCL launcher( m_queue, m_contactToConstraintKernel );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		//launcher.setConst(  cdata );
-        
-        launcher.setConst(cdata.m_nContacts);
-		launcher.setConst(cdata.m_dt);
-		launcher.setConst(cdata.m_positionDrift);
-		launcher.setConst(cdata.m_positionConstraintCoeff);
-        
-		launcher.launch1D( nContacts, 64 );	
-		clFinish(m_queue);
+		b3AlignedObjectArray<b3RigidBodyCL> gBodies;
+		bodyBuf->copyToHost(gBodies);

+		b3AlignedObjectArray<b3Contact4> gContact;
+		contactsIn->copyToHost(gContact);
+
+		b3AlignedObjectArray<b3InertiaCL> gShapes;
+		shapeBuf->copyToHost(gShapes);
+		
+		b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
+		gConstraintOut.resize(nContacts);
+		
+		if (convertConstraintOnCpu)
+		{
+			B3_PROFILE("cpu contactToConstraintKernel");
+			for (int gIdx=0;gIdx<nContacts;gIdx++)
+			{
+				int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
+				int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
+
+				b3Float4 posA = gBodies[aIdx].m_pos;
+				b3Float4 linVelA = gBodies[aIdx].m_linVel;
+				b3Float4 angVelA = gBodies[aIdx].m_angVel;
+				float invMassA = gBodies[aIdx].m_invMass;
+				b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
+
+				b3Float4 posB = gBodies[bIdx].m_pos;
+				b3Float4 linVelB = gBodies[bIdx].m_linVel;
+				b3Float4 angVelB = gBodies[bIdx].m_angVel;
+				float invMassB = gBodies[bIdx].m_invMass;
+				b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
+
+				b3ContactConstraint4_t cs;
+
+    			setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
+					&gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
+					&cs );
+		
+				cs.m_batchIdx = gContact[gIdx].m_batchIdx;
+
+				gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
+			}
+
+			contactCOut->copyFromHost(gConstraintOut);
+
+		} else
+		{
+			B3_PROFILE("gpu m_contactToConstraintKernel");
+
+		
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( shapeBuf->getBufferCL()),
+				b3BufferInfoCL( contactCOut->getBufferCL() )};
+			b3LauncherCL launcher( m_queue, m_contactToConstraintKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			//launcher.setConst(  cdata );
+        
+			launcher.setConst(cdata.m_nContacts);
+			launcher.setConst(cdata.m_dt);
+			launcher.setConst(cdata.m_positionDrift);
+			launcher.setConst(cdata.m_positionConstraintCoeff);
+        
+			launcher.launch1D( nContacts, 64 );	
+			clFinish(m_queue);
+
+		}
 	}

 	
--- a/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl
@@ -13,6 +13,9 @@ subject to the following restrictions:
 */
 //Originally written by Erwin Coumans

+
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
+
 float4 quatMult(float4 q1, float4 q2)
 {
 	float4 q;
@@ -23,40 +26,14 @@ float4 quatMult(float4 q1, float4 q2)
 	return q;
 }

-float4 quatNorm(float4 q)
-{
-	float len = native_sqrt(dot(q, q));
-	if(len > 0.f)
-	{
-		q *= 1.f / len;
-	}
-	else
-	{
-		q.x = q.y = q.z = 0.f;
-		q.w = 1.f;
-	}
-	return q;
-}


-typedef struct
-{
-	float4 m_pos;
-	float4 m_quat;
-	float4 m_linVel;
-	float4 m_angVel;
-
-	unsigned int m_collidableIdx;
-	float m_invMass;
-	float m_restituitionCoeff;
-	float m_frictionCoeff;
-} Body;




 __kernel void 
-  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)
+  integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)
 {
 	int nodeID = get_global_id(0);
 	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);
@@ -92,7 +69,7 @@ __kernel void
 			float4 orn0 = bodies[nodeID].m_quat;

 			float4 predictedOrn = quatMult(dorn, orn0);
-			predictedOrn = quatNorm(predictedOrn);
+			predictedOrn = b3QuatNormalize(predictedOrn);
 			bodies[nodeID].m_quat=predictedOrn;
 		}

--- a/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
@@ -12,17 +12,88 @@ static const char* integrateKernelCL= \
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"float4 quatMult(float4 q1, float4 q2)\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
 "{\n"
-"	float4 q;\n"
-"	q.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;\n"
-"	q.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;\n"
-"	q.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;\n"
-"	q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; \n"
-"	return q;\n"
+"	int bla;\n"
+"};\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#define b3AtomicInc atomic_inc\n"
+"#define b3Fabs fabs\n"
+"#endif\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Float4;\n"
+"	#define b3Float4ConstArg const b3Float4\n"
+"	#define b3MakeFloat4 (float4)\n"
+"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return dot(a1, b1);\n"
+"	}\n"
+"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
+"	{\n"
+"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
+"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
+"		return cross(a1, b1);\n"
+"	}\n"
+"#endif \n"
+"		\n"
+"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
+"{\n"
+"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
+"		return false;\n"
+"	return true;\n"
 "}\n"
-"float4 quatNorm(float4 q)\n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
 "{\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalize(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"{\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"inline b3Quat b3QuatNormalize(b3QuatConstArg in)\n"
+"{\n"
+"	//return b3FastNormalize4(in);\n"
 "	float len = native_sqrt(dot(q, q));\n"
 "	if(len > 0.f)\n"
 "	{\n"
@@ -35,19 +106,184 @@ static const char* integrateKernelCL= \
 "	}\n"
 "	return q;\n"
 "}\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"}\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"{\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"}\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
 "typedef struct\n"
 "{\n"
-"	float4 m_pos;\n"
-"	float4 m_quat;\n"
-"	float4 m_linVel;\n"
-"	float4 m_angVel;\n"
-"	unsigned int m_collidableIdx;\n"
-"	float m_invMass;\n"
-"	float m_restituitionCoeff;\n"
-"	float m_frictionCoeff;\n"
-"} Body;\n"
+"	b3Float4 m_row[3];\n"
+"}b3Mat3x3;\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"{\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"}\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"}\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"{\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"};\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"{\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"};\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"float4 quatMult(float4 q1, float4 q2)\n"
+"{\n"
+"	float4 q;\n"
+"	q.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;\n"
+"	q.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;\n"
+"	q.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;\n"
+"	q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; \n"
+"	return q;\n"
+"}\n"
 "__kernel void \n"
-"  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
+"  integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
 "{\n"
 "	int nodeID = get_global_id(0);\n"
 "	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
@@ -82,7 +318,7 @@ static const char* integrateKernelCL= \
 "			dorn.w = native_cos(fAngle * timeStep * 0.5f);\n"
 "			float4 orn0 = bodies[nodeID].m_quat;\n"
 "			float4 predictedOrn = quatMult(dorn, orn0);\n"
-"			predictedOrn = quatNorm(predictedOrn);\n"
+"			predictedOrn = b3QuatNormalize(predictedOrn);\n"
 "			bodies[nodeID].m_quat=predictedOrn;\n"
 "		}\n"
 "		//linear velocity		\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
@@ -14,7 +14,7 @@ subject to the following restrictions:
 */
 //Originally written by Takahiro Harada

-#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+#include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -171,238 +171,13 @@ float4 createEquation(const float4 a, const float4 b, const float4 c)
 	return eqn;
 }

-///////////////////////////////////////
-//	Matrix3x3
-///////////////////////////////////////
-
-typedef struct
-{
-	float4 m_row[3];
-}Matrix3x3;
-
-__inline
-Matrix3x3 mtZero();
-
-__inline
-Matrix3x3 mtIdentity();
-
-__inline
-Matrix3x3 mtTranspose(Matrix3x3 m);
-
-__inline
-Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);
-
-__inline
-float4 mtMul1(Matrix3x3 a, float4 b);
-
-__inline
-float4 mtMul3(float4 a, Matrix3x3 b);
-
-__inline
-Matrix3x3 mtZero()
-{
-	Matrix3x3 m;
-	m.m_row[0] = (float4)(0.f);
-	m.m_row[1] = (float4)(0.f);
-	m.m_row[2] = (float4)(0.f);
-	return m;
-}
-
-__inline
-Matrix3x3 mtIdentity()
-{
-	Matrix3x3 m;
-	m.m_row[0] = (float4)(1,0,0,0);
-	m.m_row[1] = (float4)(0,1,0,0);
-	m.m_row[2] = (float4)(0,0,1,0);
-	return m;
-}
-
-__inline
-Matrix3x3 mtTranspose(Matrix3x3 m)
-{
-	Matrix3x3 out;
-	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);
-	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);
-	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);
-	return out;
-}
-
-__inline
-Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
-{
-	Matrix3x3 transB;
-	transB = mtTranspose( b );
-	Matrix3x3 ans;
-	//	why this doesn't run when 0ing in the for{}
-	a.m_row[0].w = 0.f;
-	a.m_row[1].w = 0.f;
-	a.m_row[2].w = 0.f;
-	for(int i=0; i<3; i++)
-	{
-//	a.m_row[i].w = 0.f;
-		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
-		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
-		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
-		ans.m_row[i].w = 0.f;
-	}
-	return ans;
-}
-
-__inline
-float4 mtMul1(Matrix3x3 a, float4 b)
-{
-	float4 ans;
-	ans.x = dot3F4( a.m_row[0], b );
-	ans.y = dot3F4( a.m_row[1], b );
-	ans.z = dot3F4( a.m_row[2], b );
-	ans.w = 0.f;
-	return ans;
-}
-
-__inline
-float4 mtMul3(float4 a, Matrix3x3 b)
-{
-	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);
-	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);
-	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);
-
-	float4 ans;
-	ans.x = dot3F4( a, colx );
-	ans.y = dot3F4( a, coly );
-	ans.z = dot3F4( a, colz );
-	return ans;
-}
-
-///////////////////////////////////////
-//	Quaternion
-///////////////////////////////////////
-
-typedef float4 Quaternion;
-
-__inline
-Quaternion qtMul(Quaternion a, Quaternion b);
-
-__inline
-Quaternion qtNormalize(Quaternion in);
-
-__inline
-float4 qtRotate(Quaternion q, float4 vec);
-
-__inline
-Quaternion qtInvert(Quaternion q);
-
-__inline
-Matrix3x3 qtGetRotationMatrix(Quaternion q);
-
-
-
-__inline
-Quaternion qtMul(Quaternion a, Quaternion b)
-{
-	Quaternion ans;
-	ans = cross3( a, b );
-	ans += a.w*b+b.w*a;
-//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);
-	ans.w = a.w*b.w - dot3F4(a, b);
-	return ans;
-}
-
-__inline
-Quaternion qtNormalize(Quaternion in)
-{
-	return fastNormalize4(in);
-//	in /= length( in );
-//	return in;
-}
-__inline
-float4 qtRotate(Quaternion q, float4 vec)
-{
-	Quaternion qInv = qtInvert( q );
-	float4 vcpy = vec;
-	vcpy.w = 0.f;
-	float4 out = qtMul(qtMul(q,vcpy),qInv);
-	return out;
-}
-
-__inline
-Quaternion qtInvert(Quaternion q)
-{
-	return (Quaternion)(-q.xyz, q.w);
-}
-
-__inline
-float4 qtInvRotate(const Quaternion q, float4 vec)
-{
-	return qtRotate( qtInvert( q ), vec );
-}
-
-__inline
-Matrix3x3 qtGetRotationMatrix(Quaternion quat)
-{
-	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
-	Matrix3x3 out;
-
-	out.m_row[0].x=1-2*quat2.y-2*quat2.z;
-	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;
-	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;
-	out.m_row[0].w = 0.f;
-
-	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;
-	out.m_row[1].y=1-2*quat2.x-2*quat2.z;
-	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;
-	out.m_row[1].w = 0.f;
-
-	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;
-	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;
-	out.m_row[2].z=1-2*quat2.x-2*quat2.y;
-	out.m_row[2].w = 0.f;
-
-	return out;
-}
-
-


 #define WG_SIZE 64

-typedef struct
-{
-	float4 m_pos;
-	Quaternion m_quat;
-	float4 m_linVel;
-	float4 m_angVel;

-	u32 m_shapeIdx;
-	float m_invMass;
-	float m_restituitionCoeff;
-	float m_frictionCoeff;
-} Body;

-typedef struct
-{
-	Matrix3x3 m_invInertia;
-	Matrix3x3 m_initInvInertia;
-} Shape;

-typedef struct
-{
-	float4 m_linear;
-	float4 m_worldPos[4];
-	float4 m_center;	
-	float m_jacCoeffInv[4];
-	float m_b[4];
-	float m_appliedRambdaDt[4];
-
-	float m_fJacCoeffInv[2];	
-	float m_fAppliedRambdaDt[2];	
-
-	u32 m_bodyA;
-	u32 m_bodyB;
-
-	int m_batchIdx;
-	u32 m_paddings[1];
-} Constraint4;



@@ -425,31 +200,6 @@ typedef struct
 } ConstBufferBatchSolve;


-void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)
-{
-	*linear = make_float4(n.xyz,0.f);
-	*angular0 = cross3(r0, n);
-	*angular1 = -cross3(r1, n);
-}
-
-
-float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )
-{
-	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);
-}
-
-
-float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,
-					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)
-{
-	//	linear0,1 are normlized
-	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;
-	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);
-	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;
-	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);
-	return -1.f/(jmj0+jmj1+jmj2+jmj3);
-}
-


 
@@ -470,133 +220,8 @@ typedef struct



-typedef struct
-{
-	int m_nContacts;
-	int m_staticIdx;
-	float m_scale;
-	int m_nSplit;
-} ConstBufferSSD;


-void btPlaneSpace1 (float4 n, float4* p, float4* q);
- void btPlaneSpace1 (float4 n, float4* p, float4* q)
-{
-  if (fabs(n.z) > 0.70710678f) {
-    // choose p in y-z plane
-    float a = n.y*n.y + n.z*n.z;
-    float k = 1.f/sqrt(a);
-    p[0].x = 0;
-	p[0].y = -n.z*k;
-	p[0].z = n.y*k;
-    // set q = n x p
-    q[0].x = a*k;
-	q[0].y = -n.x*p[0].z;
-	q[0].z = n.x*p[0].y;
-  }
-  else {
-    // choose p in x-y plane
-    float a = n.x*n.x + n.y*n.y;
-    float k = 1.f/sqrt(a);
-    p[0].x = -n.y*k;
-	p[0].y = n.x*k;
-	p[0].z = 0;
-    // set q = n x p
-    q[0].x = -n.z*p[0].y;
-	q[0].y = n.z*p[0].x;
-	q[0].z = a*k;
-  }
-}
-
-
-void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
-	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, 
-	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,
-	Constraint4* dstC )
-{
-	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
-	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);
-
-	float dtInv = 1.f/dt;
-	for(int ic=0; ic<4; ic++)
-	{
-		dstC->m_appliedRambdaDt[ic] = 0.f;
-	}
-	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;
-
-
-	dstC->m_linear = src->m_worldNormalOnB;
-	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );
-	for(int ic=0; ic<4; ic++)
-	{
-		float4 r0 = src->m_worldPosB[ic] - posA;
-		float4 r1 = src->m_worldPosB[ic] - posB;
-
-		if( ic >= src->m_worldNormalOnB.w )//npoints
-		{
-			dstC->m_jacCoeffInv[ic] = 0.f;
-			continue;
-		}
-
-		float relVelN;
-		{
-			float4 linear, angular0, angular1;
-			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);
-
-			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
-				invMassA, &invInertiaA, invMassB, &invInertiaB );
-
-			relVelN = calcRelVel(linear, -linear, angular0, angular1,
-				linVelA, angVelA, linVelB, angVelB);
-
-			float e = 0.f;//src->getRestituitionCoeff();
-			if( relVelN*relVelN < 0.004f ) e = 0.f;
-
-			dstC->m_b[ic] = e*relVelN;
-			//float penetration = src->m_worldPosB[ic].w;
-			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;
-			dstC->m_appliedRambdaDt[ic] = 0.f;
-		}
-	}
-
-	if( src->m_worldNormalOnB.w > 0 )//npoints
-	{	//	prepare friction
-		float4 center = make_float4(0.f);
-		for(int i=0; i<src->m_worldNormalOnB.w; i++) 
-			center += src->m_worldPosB[i];
-		center /= (float)src->m_worldNormalOnB.w;
-
-		float4 tangent[2];
-		btPlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);
-		
-		float4 r[2];
-		r[0] = center - posA;
-		r[1] = center - posB;
-
-		for(int i=0; i<2; i++)
-		{
-			float4 linear, angular0, angular1;
-			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);
-
-			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
-				invMassA, &invInertiaA, invMassB, &invInertiaB );
-			dstC->m_fAppliedRambdaDt[i] = 0.f;
-		}
-		dstC->m_center = center;
-	}
-
-	for(int i=0; i<4; i++)
-	{
-		if( i<src->m_worldNormalOnB.w )
-		{
-			dstC->m_worldPos[i] = src->m_worldPosB[i];
-		}
-		else
-		{
-			dstC->m_worldPos[i] = make_float4(0.f);
-		}
-	}
-}

 typedef struct
 {
@@ -608,7 +233,7 @@ typedef struct

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, 
+void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, 
 int nContacts,
 float dt,
 float positionDrift,
@@ -626,15 +251,15 @@ float positionConstraintCoeff
 		float4 linVelA = gBodies[aIdx].m_linVel;
 		float4 angVelA = gBodies[aIdx].m_angVel;
 		float invMassA = gBodies[aIdx].m_invMass;
-		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;
+		b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;

 		float4 posB = gBodies[bIdx].m_pos;
 		float4 linVelB = gBodies[bIdx].m_linVel;
 		float4 angVelB = gBodies[bIdx].m_angVel;
 		float invMassB = gBodies[bIdx].m_invMass;
-		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;
+		b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;

-		Constraint4 cs;
+		b3ContactConstraint4_t cs;

    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
 			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
@@ -80,6 +80,385 @@ static const char* solverSetupCL= \
 "	contact->m_worldNormalOnB.w = (float)numPoints;\n"
 "};\n"
 "#endif //B3_CONTACT4DATA_H\n"
+"#ifndef B3_CONTACT_CONSTRAINT5_H\n"
+"#define B3_CONTACT_CONSTRAINT5_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef struct b3ContactConstraint4 b3ContactConstraint4_t;\n"
+"struct b3ContactConstraint4\n"
+"{\n"
+"	b3Float4 m_linear;//normal?\n"
+"	b3Float4 m_worldPos[4];\n"
+"	b3Float4 m_center;	//	friction\n"
+"	float m_jacCoeffInv[4];\n"
+"	float m_b[4];\n"
+"	float m_appliedRambdaDt[4];\n"
+"	float m_fJacCoeffInv[2];	//	friction\n"
+"	float m_fAppliedRambdaDt[2];	//	friction\n"
+"	unsigned int m_bodyA;\n"
+"	unsigned int m_bodyB;\n"
+"	int			m_batchIdx;\n"
+"	unsigned int m_paddings;\n"
+"};\n"
+"//inline	void setFrictionCoeff(float value) { m_linear[3] = value; }\n"
+"inline	float b3GetFrictionCoeff(b3ContactConstraint4_t* constraint) \n"
+"{\n"
+"	return constraint->m_linear.w; \n"
+"}\n"
+"#endif //B3_CONTACT_CONSTRAINT5_H\n"
+"#ifndef B3_RIGIDBODY_DATA_H\n"
+"#define B3_RIGIDBODY_DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifndef B3_QUAT_H\n"
+"#define B3_QUAT_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif\n"
+"#ifndef B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"	typedef float4	b3Quat;\n"
+"	#define b3QuatConstArg const b3Quat\n"
+"	\n"
+"	\n"
+"inline float4 b3FastNormalize4(float4 v)\n"
+"{\n"
+"	v = (float4)(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"	\n"
+"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
+"inline b3Quat b3QuatNormalize(b3QuatConstArg in);\n"
+"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
+"{\n"
+"	b3Quat ans;\n"
+"	ans = b3Cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"inline b3Quat b3QuatNormalize(b3QuatConstArg in)\n"
+"{\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"}\n"
+"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	b3Quat qInv = b3QuatInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
+"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
+"{\n"
+"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
+"}\n"
+"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
+"{\n"
+"	return b3QuatRotate( orientation, point ) + (translation);\n"
+"}\n"
+"	\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#define B3_MAT3x3_H\n"
+"#ifndef B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif \n"
+"#endif //B3_QUAT_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"typedef struct\n"
+"{\n"
+"	b3Float4 m_row[3];\n"
+"}b3Mat3x3;\n"
+"#define b3Mat3x3ConstArg const b3Mat3x3\n"
+"#define b3GetRow(m,row) (m.m_row[row])\n"
+"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
+"{\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
+"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
+"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
+"	out.m_row[0].w = 0.f;\n"
+"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
+"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
+"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
+"	out.m_row[1].w = 0.f;\n"
+"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
+"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
+"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
+"	out.m_row[2].w = 0.f;\n"
+"	return out;\n"
+"}\n"
+"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
+"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
+"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"}\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
+"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
+"struct b3RigidBodyData\n"
+"{\n"
+"	b3Float4				m_pos;\n"
+"	b3Quat					m_quat;\n"
+"	b3Float4				m_linVel;\n"
+"	b3Float4				m_angVel;\n"
+"	int 					m_collidableIdx;\n"
+"	float 				m_invMass;\n"
+"	float 				m_restituitionCoeff;\n"
+"	float 				m_frictionCoeff;\n"
+"};\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"{\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"};\n"
+"#endif //B3_RIGIDBODY_DATA_H\n"
+"	\n"
+"void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q);\n"
+" void b3PlaneSpace1 (b3Float4ConstArg n, b3Float4* p, b3Float4* q)\n"
+"{\n"
+"  if (b3Fabs(n.z) > 0.70710678f) {\n"
+"    // choose p in y-z plane\n"
+"    float a = n.y*n.y + n.z*n.z;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = 0;\n"
+"	p[0].y = -n.z*k;\n"
+"	p[0].z = n.y*k;\n"
+"    // set q = n x p\n"
+"    q[0].x = a*k;\n"
+"	q[0].y = -n.x*p[0].z;\n"
+"	q[0].z = n.x*p[0].y;\n"
+"  }\n"
+"  else {\n"
+"    // choose p in x-y plane\n"
+"    float a = n.x*n.x + n.y*n.y;\n"
+"    float k = 1.f/sqrt(a);\n"
+"    p[0].x = -n.y*k;\n"
+"	p[0].y = n.x*k;\n"
+"	p[0].z = 0;\n"
+"    // set q = n x p\n"
+"    q[0].x = -n.z*p[0].y;\n"
+"	q[0].y = n.z*p[0].x;\n"
+"	q[0].z = a*k;\n"
+"  }\n"
+"}\n"
+" \n"
+"void setLinearAndAngular( b3Float4ConstArg n, b3Float4ConstArg r0, b3Float4ConstArg r1, b3Float4* linear, b3Float4* angular0, b3Float4* angular1)\n"
+"{\n"
+"	*linear = b3MakeFloat4(n.x,n.y,n.z,0.f);\n"
+"	*angular0 = b3Cross3(r0, n);\n"
+"	*angular1 = -b3Cross3(r1, n);\n"
+"}\n"
+"float calcRelVel( b3Float4ConstArg l0, b3Float4ConstArg l1, b3Float4ConstArg a0, b3Float4ConstArg a1, b3Float4ConstArg linVel0,\n"
+"	b3Float4ConstArg angVel0, b3Float4ConstArg linVel1, b3Float4ConstArg angVel1 )\n"
+"{\n"
+"	return b3Dot3F4(l0, linVel0) + b3Dot3F4(a0, angVel0) + b3Dot3F4(l1, linVel1) + b3Dot3F4(a1, angVel1);\n"
+"}\n"
+"float calcJacCoeff(b3Float4ConstArg linear0, b3Float4ConstArg linear1, b3Float4ConstArg angular0, b3Float4ConstArg angular1,\n"
+"					float invMass0, const b3Mat3x3* invInertia0, float invMass1, const b3Mat3x3* invInertia1)\n"
+"{\n"
+"	//	linear0,1 are normlized\n"
+"	float jmj0 = invMass0;//b3Dot3F4(linear0, linear0)*invMass0;\n"
+"	float jmj1 = b3Dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
+"	float jmj2 = invMass1;//b3Dot3F4(linear1, linear1)*invMass1;\n"
+"	float jmj3 = b3Dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
+"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
+"}\n"
+"void setConstraint4( b3Float4ConstArg posA, b3Float4ConstArg linVelA, b3Float4ConstArg angVelA, float invMassA, b3Mat3x3ConstArg invInertiaA,\n"
+"	b3Float4ConstArg posB, b3Float4ConstArg linVelB, b3Float4ConstArg angVelB, float invMassB, b3Mat3x3ConstArg invInertiaB, \n"
+"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
+"	b3ContactConstraint4_t* dstC )\n"
+"{\n"
+"	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
+"	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
+"	float dtInv = 1.f/dt;\n"
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
+"	}\n"
+"	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
+"	dstC->m_linear = src->m_worldNormalOnB;\n"
+"	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
+"	for(int ic=0; ic<4; ic++)\n"
+"	{\n"
+"		b3Float4 r0 = src->m_worldPosB[ic] - posA;\n"
+"		b3Float4 r1 = src->m_worldPosB[ic] - posB;\n"
+"		if( ic >= src->m_worldNormalOnB.w )//npoints\n"
+"		{\n"
+"			dstC->m_jacCoeffInv[ic] = 0.f;\n"
+"			continue;\n"
+"		}\n"
+"		float relVelN;\n"
+"		{\n"
+"			b3Float4 linear, angular0, angular1;\n"
+"			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n"
+"			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
+"				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
+"			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
+"				linVelA, angVelA, linVelB, angVelB);\n"
+"			float e = 0.f;//src->getRestituitionCoeff();\n"
+"			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
+"			dstC->m_b[ic] = e*relVelN;\n"
+"			//float penetration = src->m_worldPosB[ic].w;\n"
+"			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
+"			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
+"		}\n"
+"	}\n"
+"	if( src->m_worldNormalOnB.w > 0 )//npoints\n"
+"	{	//	prepare friction\n"
+"		b3Float4 center = b3MakeFloat4(0.f,0.f,0.f,0.f);\n"
+"		for(int i=0; i<src->m_worldNormalOnB.w; i++) \n"
+"			center += src->m_worldPosB[i];\n"
+"		center /= (float)src->m_worldNormalOnB.w;\n"
+"		b3Float4 tangent[2];\n"
+"		b3PlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n"
+"		\n"
+"		b3Float4 r[2];\n"
+"		r[0] = center - posA;\n"
+"		r[1] = center - posB;\n"
+"		for(int i=0; i<2; i++)\n"
+"		{\n"
+"			b3Float4 linear, angular0, angular1;\n"
+"			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
+"			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
+"				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
+"			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
+"		}\n"
+"		dstC->m_center = center;\n"
+"	}\n"
+"	for(int i=0; i<4; i++)\n"
+"	{\n"
+"		if( i<src->m_worldNormalOnB.w )\n"
+"		{\n"
+"			dstC->m_worldPos[i] = src->m_worldPosB[i];\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			dstC->m_worldPos[i] = b3MakeFloat4(0.f,0.f,0.f,0.f);\n"
+"		}\n"
+"	}\n"
+"}\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
@@ -208,197 +587,9 @@ static const char* solverSetupCL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"///////////////////////////////////////\n"
-"//	Matrix3x3\n"
-"///////////////////////////////////////\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_row[3];\n"
-"}Matrix3x3;\n"
-"__inline\n"
-"Matrix3x3 mtZero();\n"
-"__inline\n"
-"Matrix3x3 mtIdentity();\n"
-"__inline\n"
-"Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"__inline\n"
-"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"__inline\n"
-"float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"__inline\n"
-"float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"__inline\n"
-"Matrix3x3 mtZero()\n"
-"{\n"
-"	Matrix3x3 m;\n"
-"	m.m_row[0] = (float4)(0.f);\n"
-"	m.m_row[1] = (float4)(0.f);\n"
-"	m.m_row[2] = (float4)(0.f);\n"
-"	return m;\n"
-"}\n"
-"__inline\n"
-"Matrix3x3 mtIdentity()\n"
-"{\n"
-"	Matrix3x3 m;\n"
-"	m.m_row[0] = (float4)(1,0,0,0);\n"
-"	m.m_row[1] = (float4)(0,1,0,0);\n"
-"	m.m_row[2] = (float4)(0,0,1,0);\n"
-"	return m;\n"
-"}\n"
-"__inline\n"
-"Matrix3x3 mtTranspose(Matrix3x3 m)\n"
-"{\n"
-"	Matrix3x3 out;\n"
-"	out.m_row[0] = (float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
-"	out.m_row[1] = (float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
-"	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
-"	return out;\n"
-"}\n"
-"__inline\n"
-"Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
-"{\n"
-"	Matrix3x3 transB;\n"
-"	transB = mtTranspose( b );\n"
-"	Matrix3x3 ans;\n"
-"	//	why this doesn't run when 0ing in the for{}\n"
-"	a.m_row[0].w = 0.f;\n"
-"	a.m_row[1].w = 0.f;\n"
-"	a.m_row[2].w = 0.f;\n"
-"	for(int i=0; i<3; i++)\n"
-"	{\n"
-"//	a.m_row[i].w = 0.f;\n"
-"		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);\n"
-"		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);\n"
-"		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);\n"
-"		ans.m_row[i].w = 0.f;\n"
-"	}\n"
-"	return ans;\n"
-"}\n"
-"__inline\n"
-"float4 mtMul1(Matrix3x3 a, float4 b)\n"
-"{\n"
-"	float4 ans;\n"
-"	ans.x = dot3F4( a.m_row[0], b );\n"
-"	ans.y = dot3F4( a.m_row[1], b );\n"
-"	ans.z = dot3F4( a.m_row[2], b );\n"
-"	ans.w = 0.f;\n"
-"	return ans;\n"
-"}\n"
-"__inline\n"
-"float4 mtMul3(float4 a, Matrix3x3 b)\n"
-"{\n"
-"	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
-"	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
-"	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"	float4 ans;\n"
-"	ans.x = dot3F4( a, colx );\n"
-"	ans.y = dot3F4( a, coly );\n"
-"	ans.z = dot3F4( a, colz );\n"
-"	return ans;\n"
-"}\n"
-"///////////////////////////////////////\n"
-"//	Quaternion\n"
-"///////////////////////////////////////\n"
-"typedef float4 Quaternion;\n"
-"__inline\n"
-"Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"__inline\n"
-"Quaternion qtNormalize(Quaternion in);\n"
-"__inline\n"
-"float4 qtRotate(Quaternion q, float4 vec);\n"
-"__inline\n"
-"Quaternion qtInvert(Quaternion q);\n"
-"__inline\n"
-"Matrix3x3 qtGetRotationMatrix(Quaternion q);\n"
-"__inline\n"
-"Quaternion qtMul(Quaternion a, Quaternion b)\n"
-"{\n"
-"	Quaternion ans;\n"
-"	ans = cross3( a, b );\n"
-"	ans += a.w*b+b.w*a;\n"
-"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
-"	ans.w = a.w*b.w - dot3F4(a, b);\n"
-"	return ans;\n"
-"}\n"
-"__inline\n"
-"Quaternion qtNormalize(Quaternion in)\n"
-"{\n"
-"	return fastNormalize4(in);\n"
-"//	in /= length( in );\n"
-"//	return in;\n"
-"}\n"
-"__inline\n"
-"float4 qtRotate(Quaternion q, float4 vec)\n"
-"{\n"
-"	Quaternion qInv = qtInvert( q );\n"
-"	float4 vcpy = vec;\n"
-"	vcpy.w = 0.f;\n"
-"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
-"	return out;\n"
-"}\n"
-"__inline\n"
-"Quaternion qtInvert(Quaternion q)\n"
-"{\n"
-"	return (Quaternion)(-q.xyz, q.w);\n"
-"}\n"
-"__inline\n"
-"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
-"{\n"
-"	return qtRotate( qtInvert( q ), vec );\n"
-"}\n"
-"__inline\n"
-"Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n"
-"{\n"
-"	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
-"	Matrix3x3 out;\n"
-"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
-"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
-"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
-"	out.m_row[0].w = 0.f;\n"
-"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
-"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
-"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
-"	out.m_row[1].w = 0.f;\n"
-"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
-"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
-"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
-"	out.m_row[2].w = 0.f;\n"
-"	return out;\n"
-"}\n"
 "#define WG_SIZE 64\n"
 "typedef struct\n"
 "{\n"
-"	float4 m_pos;\n"
-"	Quaternion m_quat;\n"
-"	float4 m_linVel;\n"
-"	float4 m_angVel;\n"
-"	u32 m_shapeIdx;\n"
-"	float m_invMass;\n"
-"	float m_restituitionCoeff;\n"
-"	float m_frictionCoeff;\n"
-"} Body;\n"
-"typedef struct\n"
-"{\n"
-"	Matrix3x3 m_invInertia;\n"
-"	Matrix3x3 m_initInvInertia;\n"
-"} Shape;\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_linear;\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_center;	\n"
-"	float m_jacCoeffInv[4];\n"
-"	float m_b[4];\n"
-"	float m_appliedRambdaDt[4];\n"
-"	float m_fJacCoeffInv[2];	\n"
-"	float m_fAppliedRambdaDt[2];	\n"
-"	u32 m_bodyA;\n"
-"	u32 m_bodyB;\n"
-"	int m_batchIdx;\n"
-"	u32 m_paddings[1];\n"
-"} Constraint4;\n"
-"typedef struct\n"
-"{\n"
 "	int m_nConstraints;\n"
 "	int m_start;\n"
 "	int m_batchIdx;\n"
@@ -413,26 +604,6 @@ static const char* solverSetupCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
-"{\n"
-"	*linear = make_float4(n.xyz,0.f);\n"
-"	*angular0 = cross3(r0, n);\n"
-"	*angular1 = -cross3(r1, n);\n"
-"}\n"
-"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
-"{\n"
-"	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
-"}\n"
-"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
-"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
-"{\n"
-"	//	linear0,1 are normlized\n"
-"	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
-"	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
-"	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
-"	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
-"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
-"}\n"
 " \n"
 "typedef struct \n"
 "{\n"
@@ -448,122 +619,13 @@ static const char* solverSetupCL= \
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
-"	int m_staticIdx;\n"
-"	float m_scale;\n"
-"	int m_nSplit;\n"
-"} ConstBufferSSD;\n"
-"void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
-" void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
-"{\n"
-"  if (fabs(n.z) > 0.70710678f) {\n"
-"    // choose p in y-z plane\n"
-"    float a = n.y*n.y + n.z*n.z;\n"
-"    float k = 1.f/sqrt(a);\n"
-"    p[0].x = 0;\n"
-"	p[0].y = -n.z*k;\n"
-"	p[0].z = n.y*k;\n"
-"    // set q = n x p\n"
-"    q[0].x = a*k;\n"
-"	q[0].y = -n.x*p[0].z;\n"
-"	q[0].z = n.x*p[0].y;\n"
-"  }\n"
-"  else {\n"
-"    // choose p in x-y plane\n"
-"    float a = n.x*n.x + n.y*n.y;\n"
-"    float k = 1.f/sqrt(a);\n"
-"    p[0].x = -n.y*k;\n"
-"	p[0].y = n.x*k;\n"
-"	p[0].z = 0;\n"
-"    // set q = n x p\n"
-"    q[0].x = -n.z*p[0].y;\n"
-"	q[0].y = n.z*p[0].x;\n"
-"	q[0].z = a*k;\n"
-"  }\n"
-"}\n"
-"void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
-"	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
-"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
-"	Constraint4* dstC )\n"
-"{\n"
-"	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
-"	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
-"	float dtInv = 1.f/dt;\n"
-"	for(int ic=0; ic<4; ic++)\n"
-"	{\n"
-"		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
-"	}\n"
-"	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
-"	dstC->m_linear = src->m_worldNormalOnB;\n"
-"	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
-"	for(int ic=0; ic<4; ic++)\n"
-"	{\n"
-"		float4 r0 = src->m_worldPosB[ic] - posA;\n"
-"		float4 r1 = src->m_worldPosB[ic] - posB;\n"
-"		if( ic >= src->m_worldNormalOnB.w )//npoints\n"
-"		{\n"
-"			dstC->m_jacCoeffInv[ic] = 0.f;\n"
-"			continue;\n"
-"		}\n"
-"		float relVelN;\n"
-"		{\n"
-"			float4 linear, angular0, angular1;\n"
-"			setLinearAndAngular(src->m_worldNormalOnB, r0, r1, &linear, &angular0, &angular1);\n"
-"			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
-"				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
-"			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
-"				linVelA, angVelA, linVelB, angVelB);\n"
-"			float e = 0.f;//src->getRestituitionCoeff();\n"
-"			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
-"			dstC->m_b[ic] = e*relVelN;\n"
-"			//float penetration = src->m_worldPosB[ic].w;\n"
-"			dstC->m_b[ic] += (src->m_worldPosB[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
-"			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
-"		}\n"
-"	}\n"
-"	if( src->m_worldNormalOnB.w > 0 )//npoints\n"
-"	{	//	prepare friction\n"
-"		float4 center = make_float4(0.f);\n"
-"		for(int i=0; i<src->m_worldNormalOnB.w; i++) \n"
-"			center += src->m_worldPosB[i];\n"
-"		center /= (float)src->m_worldNormalOnB.w;\n"
-"		float4 tangent[2];\n"
-"		btPlaneSpace1(src->m_worldNormalOnB,&tangent[0],&tangent[1]);\n"
-"		\n"
-"		float4 r[2];\n"
-"		r[0] = center - posA;\n"
-"		r[1] = center - posB;\n"
-"		for(int i=0; i<2; i++)\n"
-"		{\n"
-"			float4 linear, angular0, angular1;\n"
-"			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
-"			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
-"				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
-"			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
-"		}\n"
-"		dstC->m_center = center;\n"
-"	}\n"
-"	for(int i=0; i<4; i++)\n"
-"	{\n"
-"		if( i<src->m_worldNormalOnB.w )\n"
-"		{\n"
-"			dstC->m_worldPos[i] = src->m_worldPosB[i];\n"
-"		}\n"
-"		else\n"
-"		{\n"
-"			dstC->m_worldPos[i] = make_float4(0.f);\n"
-"		}\n"
-"	}\n"
-"}\n"
-"typedef struct\n"
-"{\n"
-"	int m_nContacts;\n"
 "	float m_dt;\n"
 "	float m_positionDrift;\n"
 "	float m_positionConstraintCoeff;\n"
 "} ConstBufferCTC;\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n"
+"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global b3RigidBodyData_t* gBodies, __global b3InertiaData_t* gShapes, __global b3ContactConstraint4_t* gConstraintOut, \n"
 "int nContacts,\n"
 "float dt,\n"
 "float positionDrift,\n"
@@ -580,13 +642,13 @@ static const char* solverSetupCL= \
 "		float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "		float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "		float invMassA = gBodies[aIdx].m_invMass;\n"
-"		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
+"		b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;\n"
 "		float4 posB = gBodies[bIdx].m_pos;\n"
 "		float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "		float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "		float invMassB = gBodies[bIdx].m_invMass;\n"
-"		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"		Constraint4 cs;\n"
+"		b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;\n"
+"		b3ContactConstraint4_t cs;\n"
 "    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
 "			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n"
 "			&cs );\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
@@ -85,7 +85,18 @@ static const char* updateAabbsKernelCL= \
 "}\n"
 "inline b3Quat b3QuatNormalize(b3QuatConstArg in)\n"
 "{\n"
-"	return b3FastNormalize4(in);\n"
+"	//return b3FastNormalize4(in);\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
 "}\n"
 "inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
 "{\n"
@@ -114,13 +125,13 @@ static const char* updateAabbsKernelCL= \
 "#else\n"
 "typedef struct\n"
 "{\n"
-"	float4 m_row[3];\n"
+"	b3Float4 m_row[3];\n"
 "}b3Mat3x3;\n"
 "#define b3Mat3x3ConstArg const b3Mat3x3\n"
 "#define b3GetRow(m,row) (m.m_row[row])\n"
 "inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
 "{\n"
-"	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
+"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
 "	b3Mat3x3 out;\n"
 "	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
 "	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
@@ -144,6 +155,87 @@ static const char* updateAabbsKernelCL= \
 "	out.m_row[2] = fabs(matIn.m_row[2]);\n"
 "	return out;\n"
 "}\n"
+"__inline\n"
+"b3Mat3x3 mtZero();\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity();\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
+"__inline\n"
+"b3Mat3x3 mtZero()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(0.f);\n"
+"	m.m_row[1] = (b3Float4)(0.f);\n"
+"	m.m_row[2] = (b3Float4)(0.f);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtIdentity()\n"
+"{\n"
+"	b3Mat3x3 m;\n"
+"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
+"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
+"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
+"	return m;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
+"{\n"
+"	b3Mat3x3 out;\n"
+"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
+"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
+"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
+"	return out;\n"
+"}\n"
+"__inline\n"
+"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Mat3x3 transB;\n"
+"	transB = mtTranspose( b );\n"
+"	b3Mat3x3 ans;\n"
+"	//	why this doesn't run when 0ing in the for{}\n"
+"	a.m_row[0].w = 0.f;\n"
+"	a.m_row[1].w = 0.f;\n"
+"	a.m_row[2].w = 0.f;\n"
+"	for(int i=0; i<3; i++)\n"
+"	{\n"
+"//	a.m_row[i].w = 0.f;\n"
+"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
+"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
+"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
+"		ans.m_row[i].w = 0.f;\n"
+"	}\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
+"{\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
+"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
+"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
+"	ans.w = 0.f;\n"
+"	return ans;\n"
+"}\n"
+"__inline\n"
+"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
+"{\n"
+"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
+"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
+"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
+"	b3Float4 ans;\n"
+"	ans.x = b3Dot3F4( a, colx );\n"
+"	ans.y = b3Dot3F4( a, coly );\n"
+"	ans.z = b3Dot3F4( a, colz );\n"
+"	return ans;\n"
+"}\n"
 "#endif\n"
 "#endif //B3_MAT3x3_H\n"
 "typedef struct b3Aabb b3Aabb_t;\n"
@@ -260,6 +352,11 @@ static const char* updateAabbsKernelCL= \
 "#else\n"
 "#endif \n"
 "#endif //B3_QUAT_H\n"
+"#ifndef B3_MAT3x3_H\n"
+"#ifdef __cplusplus\n"
+"#else\n"
+"#endif\n"
+"#endif //B3_MAT3x3_H\n"
 "typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
 "struct b3RigidBodyData\n"
 "{\n"
@@ -272,7 +369,12 @@ static const char* updateAabbsKernelCL= \
 "	float 				m_restituitionCoeff;\n"
 "	float 				m_frictionCoeff;\n"
 "};\n"
-"	\n"
+"typedef struct b3InertiaData b3InertiaData_t;\n"
+"struct b3InertiaData\n"
+"{\n"
+"	b3Mat3x3 m_invInertiaWorld;\n"
+"	b3Mat3x3 m_initInvInertia;\n"
+"};\n"
 "#endif //B3_RIGIDBODY_DATA_H\n"
 "	\n"
 "void b3ComputeWorldAabb(  int bodyId, __global const b3RigidBodyData_t* bodies, __global const  b3Collidable_t* collidables, __global const  b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n"