Change the lcpp Lua preprocessor to keep #defines and comments, and to remove empty lines

Remove duplicate data in b3Contact4 (now in b3Contact4Data, shared between CPU/C++ and OpenCL). OpenCL kernels use #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h". Increase the number of batches back to 250 (from 50); this hard-coded number still needs to be fixed (see https://github.com/erwincoumans/bullet3/issues/12). Work towards GJK/EPA, in addition to SAT/clipping (early on).
2013-08-08 12:24:09 -07:00
parent 46a08e3282
commit 3bf003ace1
50 changed files with 920 additions and 2731 deletions
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl
@@ -13,6 +13,7 @@ subject to the following restrictions:
 */
 //Originally written by Takahiro Harada

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -64,22 +65,7 @@ typedef unsigned char u8;



-typedef struct 
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyA;//sign bit set for fixed objects
-	int m_bodyB;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-}Contact4;

 typedef struct 
 {
@@ -133,7 +119,7 @@ u32 tryWrite(__local u32* buff, int idx)
 }

 //	batching on the GPU
-__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,
+__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
 		__global const u32* gN, __global const u32* gStart, 
 		int m_staticIdx )
 {
@@ -186,8 +172,8 @@ __kernel void CreateBatches( __global const Contact4* gConstraints, __global Con
 							int dstIdx;
 							AtomInc1( ldsRingEnd, dstIdx );
 							
-							int a = gConstraints[m_start+srcIdx].m_bodyA;
-							int b = gConstraints[m_start+srcIdx].m_bodyB;
+							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;
+							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;
 							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
 							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
 							ldsRingElem[dstIdx].m_idx = srcIdx;
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
@@ -2,37 +2,71 @@
 static const char* batchingKernelsCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile __global int*\n"
 "#endif\n"
-"\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,43 +80,16 @@ static const char* batchingKernelsCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
-"\n"
-"\n"
-"typedef struct \n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyA;//sign bit set for fixed objects\n"
-"	int m_bodyB;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"}Contact4;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_n;\n"
@@ -90,24 +97,19 @@ static const char* batchingKernelsCL= \
 "	int m_staticIdx;\n"
 "	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_a;\n"
 "	int m_b;\n"
 "	u32 m_idx;\n"
 "}Elem;\n"
-"\n"
 "#define STACK_SIZE (WG_SIZE*10)\n"
 "//#define STACK_SIZE (WG_SIZE)\n"
 "#define RING_SIZE 1024\n"
 "#define RING_SIZE_MASK (RING_SIZE-1)\n"
 "#define CHECK_SIZE (WG_SIZE)\n"
-"\n"
-"\n"
 "#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
 "#define RING_END ldsTmp\n"
-"\n"
 "u32 readBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -115,7 +117,6 @@ static const char* batchingKernelsCL= \
 "	int bufIdx = idx/32;\n"
 "	return buff[bufIdx] & (1<<bitIdx);\n"
 "}\n"
-"\n"
 "void writeBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -124,7 +125,6 @@ static const char* batchingKernelsCL= \
 "//	buff[bufIdx] |= (1<<bitIdx);\n"
 "	atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "}\n"
-"\n"
 "u32 tryWrite(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -133,9 +133,8 @@ static const char* batchingKernelsCL= \
 "	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "	return ((ans >> bitIdx)&1) == 0;\n"
 "}\n"
-"\n"
 "//	batching on the GPU\n"
-"__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,\n"
+"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
 "		__global const u32* gN, __global const u32* gStart, \n"
 "		int m_staticIdx )\n"
 "{\n"
@@ -148,7 +147,6 @@ static const char* batchingKernelsCL= \
 "	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
 "	__local u32 ldsGEnd;\n"
 "	__local u32 ldsDstEnd;\n"
-"\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	\n"
@@ -168,7 +166,6 @@ static const char* batchingKernelsCL= \
 "	for(int ie=0; ie<50; ie++)\n"
 "	{\n"
 "		ldsFixedBuffer[lIdx] = 0;\n"
-"\n"
 "		for(int giter=0; giter<4; giter++)\n"
 "		{\n"
 "			int ringCap = GET_RING_CAPACITY;\n"
@@ -188,8 +185,8 @@ static const char* batchingKernelsCL= \
 "							int dstIdx;\n"
 "							AtomInc1( ldsRingEnd, dstIdx );\n"
 "							\n"
-"							int a = gConstraints[m_start+srcIdx].m_bodyA;\n"
-"							int b = gConstraints[m_start+srcIdx].m_bodyB;\n"
+"							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
+"							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
 "							ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
 "							ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
 "							ldsRingElem[dstIdx].m_idx = srcIdx;\n"
@@ -198,37 +195,31 @@ static const char* batchingKernelsCL= \
 "					ringCap = GET_RING_CAPACITY;\n"
 "				}\n"
 "			}\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
 "	\n"
 "			//	2. fill stack\n"
 "			__local Elem* dst = ldsRingElem;\n"
 "			if( lIdx == 0 ) RING_END = 0;\n"
-"\n"
 "			int srcIdx=lIdx;\n"
 "			int end = ldsRingEnd;\n"
-"\n"
 "			{\n"
 "				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
 "				{\n"
 "					Elem e;\n"
 "					if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
 "					bool done = (srcIdx<end)?false:true;\n"
-"\n"
 "					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
 "					\n"
 "					if( !done )\n"
 "					{\n"
 "						int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
 "						int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
-"\n"
 "						if( aUsed==0 && bUsed==0 )\n"
 "						{\n"
 "							int aAvailable=1;\n"
 "							int bAvailable=1;\n"
 "							int ea = abs(e.m_a);\n"
 "							int eb = abs(e.m_b);\n"
-"\n"
 "							bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
 "							bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
 "							\n"
@@ -239,7 +230,6 @@ static const char* batchingKernelsCL= \
 "							\n"
 "							//aAvailable = aStatic? 1: aAvailable;\n"
 "							//bAvailable = bStatic? 1: bAvailable;\n"
-"\n"
 "							bool success = (aAvailable && bAvailable);\n"
 "							if(success)\n"
 "							{\n"
@@ -252,7 +242,6 @@ static const char* batchingKernelsCL= \
 "							done = success;\n"
 "						}\n"
 "					}\n"
-"\n"
 "					//	put it aside\n"
 "					if(srcIdx<end)\n"
 "					{\n"
@@ -272,7 +261,6 @@ static const char* batchingKernelsCL= \
 "							dst[dstIdx] = e;\n"
 "						}\n"
 "					}\n"
-"\n"
 "					//	if filled, flush\n"
 "					if( ldsStackEnd == STACK_SIZE )\n"
 "					{\n"
@@ -284,18 +272,14 @@ static const char* batchingKernelsCL= \
 "							gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
 "						}\n"
 "						if( lIdx == 0 ) ldsStackEnd = 0;\n"
-"\n"
 "						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
 "						ldsFixedBuffer[lIdx] = 0;\n"
 "					}\n"
 "				}\n"
 "			}\n"
-"\n"
 "			if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
 "		}\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
 "		{\n"
 "			int idx = m_start + ldsStackIdx[i];\n"
@@ -303,7 +287,6 @@ static const char* batchingKernelsCL= \
 "			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
 "			gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
 "		}\n"
-"\n"
 "		//	in case it couldn't consume any pair. Flush them\n"
 "		//	todo. Serial batch worth while?\n"
 "		if( ldsStackEnd == 0 )\n"
@@ -318,38 +301,11 @@ static const char* batchingKernelsCL= \
 "			GROUP_LDS_BARRIER;\n"
 "			if( lIdx == 0 ) ldsRingEnd = 0;\n"
 "		}\n"
-"\n"
 "		if( lIdx == 0 ) ldsStackEnd = 0;\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		//	termination\n"
 "		if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
 "			break;\n"
 "	}\n"
-"\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl
@@ -13,6 +13,7 @@ subject to the following restrictions:
 */
 //Originally written by Erwin Coumans

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -65,22 +66,7 @@ typedef unsigned char u8;



-typedef struct 
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;//sign bit set for fixed objects
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-}Contact4;

 typedef struct 
 {
@@ -102,7 +88,7 @@ typedef struct


 //	batching on the GPU
-__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )
+__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )
 {
 	int wgIdx = GET_GROUP_IDX;
 	int lIdx = GET_LOCAL_IDX;
@@ -155,13 +141,13 @@ u32 tryWrite(__local u32* buff, int idx)


 //	batching on the GPU
-__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
+__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
 {
 	int wgIdx = GET_GROUP_IDX;
 	int lIdx = GET_LOCAL_IDX;
 	const int numConstraints = gN[wgIdx];
 	const int m_start = gStart[wgIdx];
-		
+	b3Contact4Data_t tmp;
 	
 	__local u32 ldsFixedBuffer[CHECK_SIZE];
 		
@@ -173,7 +159,7 @@ __kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const
 	{
 	
 		
-		__global Contact4* cs = &gConstraints[m_start];	
+		__global struct b3Contact4Data* cs = &gConstraints[m_start];	
 	
 		
 		int numValidConstraints = 0;
@@ -214,12 +200,52 @@ __kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const

 					if (i!=numValidConstraints)
 					{
-						//btSwap(cs[i],cs[numValidConstraints]);
-						
-						Contact4 tmp = cs[i];
-						cs[i] = cs[numValidConstraints];
-						cs[numValidConstraints] = tmp;
-						
+
+//						tmp = cs[i];
+//						cs[i] = cs[numValidConstraints];
+//						cs[numValidConstraints]  = tmp;
+
+#ifdef CHECK_SIZE
+						tmp.m_worldPos[0] = cs[i].m_worldPos[0];
+						tmp.m_worldPos[1] = cs[i].m_worldPos[1];
+						tmp.m_worldPos[2] = cs[i].m_worldPos[2];
+						tmp.m_worldPos[3] = cs[i].m_worldPos[3];
+						tmp.m_worldNormal = cs[i].m_worldNormal;
+						tmp.m_restituitionCoeffCmp = cs[i].m_restituitionCoeffCmp;
+						tmp.m_frictionCoeffCmp = cs[i].m_frictionCoeffCmp;
+						tmp.m_batchIdx = cs[i].m_batchIdx;
+						tmp.m_bodyAPtrAndSignBit = cs[i].m_bodyAPtrAndSignBit;
+						tmp.m_bodyBPtrAndSignBit = cs[i].m_bodyBPtrAndSignBit;
+						tmp.m_childIndexA = cs[i].m_childIndexA;
+						tmp.m_childIndexB = cs[i].m_childIndexB;
+
+						cs[i].m_worldPos[0] = cs[numValidConstraints].m_worldPos[0];
+						cs[i].m_worldPos[1] = cs[numValidConstraints].m_worldPos[1];
+						cs[i].m_worldPos[2] = cs[numValidConstraints].m_worldPos[2];
+						cs[i].m_worldPos[3] = cs[numValidConstraints].m_worldPos[3];
+						cs[i].m_worldNormal = cs[numValidConstraints].m_worldNormal;
+						cs[i].m_restituitionCoeffCmp = cs[numValidConstraints].m_restituitionCoeffCmp;
+						cs[i].m_frictionCoeffCmp = cs[numValidConstraints].m_frictionCoeffCmp;
+						cs[i].m_batchIdx = cs[numValidConstraints].m_batchIdx;
+						cs[i].m_bodyAPtrAndSignBit = cs[numValidConstraints].m_bodyAPtrAndSignBit;
+						cs[i].m_bodyBPtrAndSignBit = cs[numValidConstraints].m_bodyBPtrAndSignBit;
+						cs[i].m_childIndexA = cs[numValidConstraints].m_childIndexA;
+						cs[i].m_childIndexB = cs[numValidConstraints].m_childIndexB;
+
+						cs[numValidConstraints].m_worldPos[0] = tmp.m_worldPos[0];
+						cs[numValidConstraints].m_worldPos[1] = tmp.m_worldPos[1];
+						cs[numValidConstraints].m_worldPos[2] = tmp.m_worldPos[2];
+						cs[numValidConstraints].m_worldPos[3] = tmp.m_worldPos[3];
+						cs[numValidConstraints].m_worldNormal = tmp.m_worldNormal;
+						cs[numValidConstraints].m_restituitionCoeffCmp = tmp.m_restituitionCoeffCmp;
+						cs[numValidConstraints].m_frictionCoeffCmp = tmp.m_frictionCoeffCmp;
+						cs[numValidConstraints].m_batchIdx = tmp.m_batchIdx;
+						cs[numValidConstraints].m_bodyAPtrAndSignBit = tmp.m_bodyAPtrAndSignBit;
+						cs[numValidConstraints].m_bodyBPtrAndSignBit = tmp.m_bodyBPtrAndSignBit;
+						cs[numValidConstraints].m_childIndexA = tmp.m_childIndexA;
+						cs[numValidConstraints].m_childIndexB = tmp.m_childIndexB;
+#endif
+
 					}

 					numValidConstraints++;
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
@@ -2,38 +2,72 @@
 static const char* batchingKernelsNewCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile __global int*\n"
 "#endif\n"
-"\n"
 "#define SIMD_WIDTH 64\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -47,43 +81,16 @@ static const char* batchingKernelsNewCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
-"\n"
-"\n"
-"typedef struct \n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;//sign bit set for fixed objects\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"}Contact4;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_n;\n"
@@ -91,20 +98,14 @@ static const char* batchingKernelsNewCL= \
 "	int m_staticIdx;\n"
 "	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_a;\n"
 "	int m_b;\n"
 "	u32 m_idx;\n"
 "}Elem;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "//	batching on the GPU\n"
-"__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
+"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
 "{\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
@@ -122,13 +123,7 @@ static const char* batchingKernelsNewCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "#define CHECK_SIZE (WG_SIZE)\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "u32 readBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -136,7 +131,6 @@ static const char* batchingKernelsNewCL= \
 "	int bufIdx = idx/32;\n"
 "	return buff[bufIdx] & (1<<bitIdx);\n"
 "}\n"
-"\n"
 "void writeBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -145,7 +139,6 @@ static const char* batchingKernelsNewCL= \
 "	buff[bufIdx] |= (1<<bitIdx);\n"
 "	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "}\n"
-"\n"
 "u32 tryWrite(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -154,16 +147,14 @@ static const char* batchingKernelsNewCL= \
 "	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "	return ((ans >> bitIdx)&1) == 0;\n"
 "}\n"
-"\n"
-"\n"
 "//	batching on the GPU\n"
-"__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
+"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
 "{\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	const int numConstraints = gN[wgIdx];\n"
 "	const int m_start = gStart[wgIdx];\n"
-"		\n"
+"	b3Contact4Data_t tmp;\n"
 "	\n"
 "	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
 "		\n"
@@ -175,12 +166,11 @@ static const char* batchingKernelsNewCL= \
 "	{\n"
 "	\n"
 "		\n"
-"		__global Contact4* cs = &gConstraints[m_start];	\n"
+"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
 "	\n"
 "		\n"
 "		int numValidConstraints = 0;\n"
 "		int batchIdx = 0;\n"
-"\n"
 "		while( numValidConstraints < numConstraints)\n"
 "		{\n"
 "			int nCurrentBatch = 0;\n"
@@ -188,10 +178,8 @@ static const char* batchingKernelsNewCL= \
 "	\n"
 "			for(int i=0; i<CHECK_SIZE; i++) \n"
 "				ldsFixedBuffer[i] = 0;		\n"
-"\n"
 "			for(int i=numValidConstraints; i<numConstraints; i++)\n"
 "			{\n"
-"\n"
 "				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
 "				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
 "				int bodyA = abs(bodyAS);\n"
@@ -211,19 +199,51 @@ static const char* batchingKernelsNewCL= \
 "					{\n"
 "						writeBuf( ldsFixedBuffer, bodyB );\n"
 "					}\n"
-"\n"
 "					cs[i].m_batchIdx = batchIdx;\n"
-"\n"
 "					if (i!=numValidConstraints)\n"
 "					{\n"
-"						//btSwap(cs[i],cs[numValidConstraints]);\n"
-"						\n"
-"						Contact4 tmp = cs[i];\n"
-"						cs[i] = cs[numValidConstraints];\n"
-"						cs[numValidConstraints] = tmp;\n"
-"						\n"
+"//						tmp = cs[i];\n"
+"//						cs[i] = cs[numValidConstraints];\n"
+"//						cs[numValidConstraints]  = tmp;\n"
+"#ifdef CHECK_SIZE\n"
+"						tmp.m_worldPos[0] = cs[i].m_worldPos[0];\n"
+"						tmp.m_worldPos[1] = cs[i].m_worldPos[1];\n"
+"						tmp.m_worldPos[2] = cs[i].m_worldPos[2];\n"
+"						tmp.m_worldPos[3] = cs[i].m_worldPos[3];\n"
+"						tmp.m_worldNormal = cs[i].m_worldNormal;\n"
+"						tmp.m_restituitionCoeffCmp = cs[i].m_restituitionCoeffCmp;\n"
+"						tmp.m_frictionCoeffCmp = cs[i].m_frictionCoeffCmp;\n"
+"						tmp.m_batchIdx = cs[i].m_batchIdx;\n"
+"						tmp.m_bodyAPtrAndSignBit = cs[i].m_bodyAPtrAndSignBit;\n"
+"						tmp.m_bodyBPtrAndSignBit = cs[i].m_bodyBPtrAndSignBit;\n"
+"						tmp.m_childIndexA = cs[i].m_childIndexA;\n"
+"						tmp.m_childIndexB = cs[i].m_childIndexB;\n"
+"						cs[i].m_worldPos[0] = cs[numValidConstraints].m_worldPos[0];\n"
+"						cs[i].m_worldPos[1] = cs[numValidConstraints].m_worldPos[1];\n"
+"						cs[i].m_worldPos[2] = cs[numValidConstraints].m_worldPos[2];\n"
+"						cs[i].m_worldPos[3] = cs[numValidConstraints].m_worldPos[3];\n"
+"						cs[i].m_worldNormal = cs[numValidConstraints].m_worldNormal;\n"
+"						cs[i].m_restituitionCoeffCmp = cs[numValidConstraints].m_restituitionCoeffCmp;\n"
+"						cs[i].m_frictionCoeffCmp = cs[numValidConstraints].m_frictionCoeffCmp;\n"
+"						cs[i].m_batchIdx = cs[numValidConstraints].m_batchIdx;\n"
+"						cs[i].m_bodyAPtrAndSignBit = cs[numValidConstraints].m_bodyAPtrAndSignBit;\n"
+"						cs[i].m_bodyBPtrAndSignBit = cs[numValidConstraints].m_bodyBPtrAndSignBit;\n"
+"						cs[i].m_childIndexA = cs[numValidConstraints].m_childIndexA;\n"
+"						cs[i].m_childIndexB = cs[numValidConstraints].m_childIndexB;\n"
+"						cs[numValidConstraints].m_worldPos[0] = tmp.m_worldPos[0];\n"
+"						cs[numValidConstraints].m_worldPos[1] = tmp.m_worldPos[1];\n"
+"						cs[numValidConstraints].m_worldPos[2] = tmp.m_worldPos[2];\n"
+"						cs[numValidConstraints].m_worldPos[3] = tmp.m_worldPos[3];\n"
+"						cs[numValidConstraints].m_worldNormal = tmp.m_worldNormal;\n"
+"						cs[numValidConstraints].m_restituitionCoeffCmp = tmp.m_restituitionCoeffCmp;\n"
+"						cs[numValidConstraints].m_frictionCoeffCmp = tmp.m_frictionCoeffCmp;\n"
+"						cs[numValidConstraints].m_batchIdx = tmp.m_batchIdx;\n"
+"						cs[numValidConstraints].m_bodyAPtrAndSignBit = tmp.m_bodyAPtrAndSignBit;\n"
+"						cs[numValidConstraints].m_bodyBPtrAndSignBit = tmp.m_bodyBPtrAndSignBit;\n"
+"						cs[numValidConstraints].m_childIndexA = tmp.m_childIndexA;\n"
+"						cs[numValidConstraints].m_childIndexB = tmp.m_childIndexB;\n"
+"#endif\n"
 "					}\n"
-"\n"
 "					numValidConstraints++;\n"
 "					\n"
 "					nCurrentBatch++;\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
@@ -2,19 +2,16 @@
 static const char* integrateKernelCL= \
 "/*\n"
 "Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
 "float4 quatMult(float4 q1, float4 q2)\n"
 "{\n"
 "	float4 q;\n"
@@ -24,7 +21,6 @@ static const char* integrateKernelCL= \
 "	q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; \n"
 "	return q;\n"
 "}\n"
-"\n"
 "float4 quatNorm(float4 q)\n"
 "{\n"
 "	float len = native_sqrt(dot(q, q));\n"
@@ -39,24 +35,17 @@ static const char* integrateKernelCL= \
 "	}\n"
 "	return q;\n"
 "}\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	unsigned int m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel void \n"
 "  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
 "{\n"
@@ -92,12 +81,10 @@ static const char* integrateKernelCL= \
 "			float4 dorn = axis;\n"
 "			dorn.w = native_cos(fAngle * timeStep * 0.5f);\n"
 "			float4 orn0 = bodies[nodeID].m_quat;\n"
-"\n"
 "			float4 predictedOrn = quatMult(dorn, orn0);\n"
 "			predictedOrn = quatNorm(predictedOrn);\n"
 "			bodies[nodeID].m_quat=predictedOrn;\n"
 "		}\n"
-"\n"
 "		//linear velocity		\n"
 "		bodies[nodeID].m_pos +=  bodies[nodeID].m_linVel * timeStep;\n"
 "		\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
@@ -2,56 +2,37 @@
 static const char* solveConstraintRowsCL= \
 "/*\n"
 "Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
 "#define B3_CONSTRAINT_FLAG_ENABLED 1\n"
-"\n"
 "#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3\n"
 "#define B3_GPU_FIXED_CONSTRAINT_TYPE 4\n"
-"\n"
 "#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n"
 "#define B3_INFINITY 1e30f\n"
-"\n"
 "#define mymake_float4 (float4)\n"
-"\n"
-"\n"
 "__inline float dot3F4(float4 a, float4 b)\n"
 "{\n"
 "	float4 a1 = mymake_float4(a.xyz,0.f);\n"
 "	float4 b1 = mymake_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -62,36 +43,28 @@ static const char* solveConstraintRowsCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertiaWorld;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} BodyInertia;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_basis;//orientation\n"
 "	float4	m_origin;//transform\n"
 "}b3Transform;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "//	b3Transform		m_worldTransformUnused;\n"
@@ -104,38 +77,30 @@ static const char* solveConstraintRowsCL= \
 "	float4		m_turnVelocity;\n"
 "	float4		m_linearVelocity;\n"
 "	float4		m_angularVelocity;\n"
-"\n"
 "	union \n"
 "	{\n"
 "		void*	m_originalBody;\n"
 "		int		m_originalBodyIndex;\n"
 "	};\n"
 "	int padding[3];\n"
-"\n"
 "} b3GpuSolverBody;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	unsigned int m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} b3RigidBodyCL;\n"
-"\n"
 "typedef struct\n"
 "{\n"
-"\n"
 "	float4		m_relpos1CrossNormal;\n"
 "	float4		m_contactNormal;\n"
-"\n"
 "	float4		m_relpos2CrossNormal;\n"
 "	//float4		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n"
-"\n"
 "	float4		m_angularComponentA;\n"
 "	float4		m_angularComponentB;\n"
 "	\n"
@@ -152,15 +117,11 @@ static const char* solveConstraintRowsCL= \
 "	float		m_upperLimit;\n"
 "	float		m_rhsPenetration;\n"
 "	int			m_originalConstraint;\n"
-"\n"
-"\n"
 "	int	m_overrideNumSolverIterations;\n"
 "    int			m_frictionIndex;\n"
 "	int m_solverBodyIdA;\n"
 "	int m_solverBodyIdB;\n"
-"\n"
 "} b3SolverConstraint;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_bodyAPtrAndSignBit;\n"
@@ -168,28 +129,18 @@ static const char* solveConstraintRowsCL= \
 "	int m_originalConstraintIndex;\n"
 "	int m_batchId;\n"
 "} b3BatchConstraint;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int				m_constraintType;\n"
 "	int				m_rbA;\n"
 "	int				m_rbB;\n"
 "	float			m_breakingImpulseThreshold;\n"
-"\n"
 "	float4 m_pivotInA;\n"
 "	float4 m_pivotInB;\n"
 "	Quaternion m_relTargetAB;\n"
-"\n"
 "	int	m_flags;\n"
 "	int m_padding[3];\n"
 "} b3GpuGenericConstraint;\n"
-"\n"
-"\n"
 "/*b3Transform	getWorldTransform(b3RigidBodyCL* rb)\n"
 "{\n"
 "	b3Transform newTrans;\n"
@@ -197,39 +148,25 @@ static const char* solveConstraintRowsCL= \
 "	newTrans.setRotation(rb->m_quat);\n"
 "	return newTrans;\n"
 "}*/\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	v = mymake_float4(v.xyz,0.f);\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -240,7 +177,6 @@ static const char* solveConstraintRowsCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -257,30 +193,23 @@ static const char* solveConstraintRowsCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
-"\n"
 "__inline void internalApplyImpulse(__global b3GpuSolverBody* body,  float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n"
 "{\n"
 "	body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n"
 "	body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n"
 "}\n"
-"\n"
-"\n"
 "void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n"
 "{\n"
 "	float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n"
 "	float deltaVel1Dotn	=	dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) 	+ dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n"
 "	float deltaVel2Dotn	=	-dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n"
-"\n"
 "	deltaImpulse	-=	deltaVel1Dotn*c->m_jacDiagABInv;\n"
 "	deltaImpulse	-=	deltaVel2Dotn*c->m_jacDiagABInv;\n"
-"\n"
 "	float sum = c->m_appliedImpulse + deltaImpulse;\n"
 "	if (sum < c->m_lowerLimit)\n"
 "	{\n"
@@ -296,12 +225,9 @@ static const char* solveConstraintRowsCL= \
 "	{\n"
 "		c->m_appliedImpulse = sum;\n"
 "	}\n"
-"\n"
 "	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n"
 "	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n"
-"\n"
 "}\n"
-"\n"
 "__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n"
 "					  __global b3BatchConstraint* batchConstraints,\n"
 "					  	__global b3SolverConstraint* rows,\n"
@@ -315,7 +241,6 @@ static const char* solveConstraintRowsCL= \
 "	int b = get_global_id(0);\n"
 "	if (b>=numConstraintsInBatch)\n"
 "		return;\n"
-"\n"
 "	__global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n"
 "	int originalConstraintIndex = c->m_originalConstraintIndex;\n"
 "	if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n"
@@ -329,16 +254,13 @@ static const char* solveConstraintRowsCL= \
 "		}\n"
 "	}\n"
 "};\n"
-"\n"
 "__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numBodies)\n"
 "		return;\n"
-"\n"
 "	__global b3GpuSolverBody* solverBody = &solverBodies[i];\n"
 "	__global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n"
-"\n"
 "	solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n"
 "	solverBody->m_deltaAngularVelocity  = (float4)(0.f,0.f,0.f,0.f);\n"
 "	solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n"
@@ -350,7 +272,6 @@ static const char* solveConstraintRowsCL= \
 "	solverBody->m_linearVelocity = bodyCL->m_linVel;\n"
 "	solverBody->m_angularVelocity = bodyCL->m_angVel;\n"
 "}\n"
-"\n"
 "__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n"
 "{\n"
 "	int cid = get_global_id(0);\n"
@@ -370,17 +291,12 @@ static const char* solveConstraintRowsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numConstraints)\n"
 "		return;\n"
-"\n"
 "	__global b3GpuGenericConstraint* constraint = &constraints[i];\n"
-"\n"
 "	switch (constraint->m_constraintType)\n"
 "	{\n"
 "		case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n"
@@ -398,7 +314,6 @@ static const char* solveConstraintRowsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n"
 "										__global b3BatchConstraint* batchConstraints, \n"
 "										__global b3GpuGenericConstraint* constraints,\n"
@@ -408,26 +323,18 @@ static const char* solveConstraintRowsCL= \
 "	int i = get_global_id(0);\n"
 "	if (i>=numConstraints)\n"
 "		return;\n"
-"\n"
 "	int rbA = constraints[i].m_rbA;\n"
 "	int rbB = constraints[i].m_rbB;\n"
-"\n"
 "	batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass? rbA : -rbA;\n"
 "	batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass? rbB : -rbB;\n"
 "	batchConstraints[i].m_batchId = -1;\n"
 "	batchConstraints[i].m_originalConstraintIndex = i;\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	// integrator parameters: frames per second (1/stepsize), default error\n"
 "	// reduction parameter (0..1).\n"
 "	float fps,erp;\n"
-"\n"
 "	// for the first and second body, pointers to two (linear and angular)\n"
 "	// n*3 jacobian sub matrices, stored by rows. these matrices will have\n"
 "	// been initialized to 0 on entry. if the second body is zero then the\n"
@@ -441,7 +348,6 @@ static const char* solveConstraintRowsCL= \
 "	{\n"
 "		__global float4* m_J1angularAxisFloat4;\n"
 "		__global float* m_J1angularAxis;\n"
-"\n"
 "	};\n"
 "	union\n"
 "	{\n"
@@ -455,17 +361,14 @@ static const char* solveConstraintRowsCL= \
 "	};\n"
 "	// elements to jump from one row to the next in J's\n"
 "	int rowskip;\n"
-"\n"
 "	// right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n"
 "	// \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n"
 "	// set to a constant value (typically very small or zero) value on entry.\n"
 "	__global float* m_constraintError;\n"
 "	__global float* cfm;\n"
-"\n"
 "	// lo and hi limits for variables (set to -/+ infinity on entry).\n"
 "	__global float* m_lowerLimit;\n"
 "	__global float* m_upperLimit;\n"
-"\n"
 "	// findex vector for variables. see the LCP solver interface for a\n"
 "	// description of what this does. this is set to -1 on entry.\n"
 "	// note that the returned indexes are relative to the first index of\n"
@@ -473,39 +376,28 @@ static const char* solveConstraintRowsCL= \
 "	__global int *findex;\n"
 "	// number of solver iterations\n"
 "	int m_numIterations;\n"
-"\n"
 "	//damping of the velocity\n"
 "	float	m_damping;\n"
 "} b3GpuConstraintInfo2;\n"
-"\n"
-"\n"
 "void	getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n"
 "{\n"
 "	*v0 = (float4)(0.		,-vecIn.z		,vecIn.y,0.f);\n"
 "	*v1 = (float4)(vecIn.z	,0.			,-vecIn.x,0.f);\n"
 "	*v2 = (float4)(-vecIn.y	,vecIn.x	,0.f,0.f);\n"
 "}\n"
-"\n"
-"\n"
 "void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n"
 "{\n"
 "	float4 posA = bodies[constraint->m_rbA].m_pos;\n"
 "	Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n"
-"\n"
 "	float4 posB = bodies[constraint->m_rbB].m_pos;\n"
 "	Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n"
-"\n"
-"\n"
-"\n"
 "		// anchor points in global coordinates with respect to body PORs.\n"
 "   \n"
 "    // set jacobian\n"
 "    info->m_J1linearAxis[0] = 1;\n"
 "	info->m_J1linearAxis[info->rowskip+1] = 1;\n"
 "	info->m_J1linearAxis[2*info->rowskip+2] = 1;\n"
-"\n"
 "	float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n"
-"\n"
 "	{\n"
 "		__global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n"
 "		__global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n"
@@ -533,18 +425,15 @@ static const char* solveConstraintRowsCL= \
 "    // set right hand side\n"
 "//	float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;\n"
 "	float currERP = info->erp;\n"
-"\n"
 "	float k = info->fps * currERP;\n"
 "    int j;\n"
 "	float4 result = a2 + posB - a1 - posA;\n"
 "	float* resultPtr = &result;\n"
-"\n"
 "	for (j=0; j<3; j++)\n"
 "    {\n"
 "        info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n"
 "    }\n"
 "}\n"
-"\n"
 "Quaternion nearest( Quaternion first, Quaternion qd)\n"
 "{\n"
 "	Quaternion diff,sum;\n"
@@ -555,7 +444,6 @@ static const char* solveConstraintRowsCL= \
 "		return qd;\n"
 "	return (-qd);\n"
 "}\n"
-"\n"
 "float b3Acos(float x) \n"
 "{ \n"
 "	if (x<-1)	\n"
@@ -564,7 +452,6 @@ static const char* solveConstraintRowsCL= \
 "		x=1;\n"
 "	return acos(x); \n"
 "}\n"
-"\n"
 "float getAngle(Quaternion orn)\n"
 "{\n"
 "	if (orn.w>=1.f)\n"
@@ -572,7 +459,6 @@ static const char* solveConstraintRowsCL= \
 "	float s = 2.f * b3Acos(orn.w);\n"
 "	return s;\n"
 "}\n"
-"\n"
 "void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n"
 "{\n"
 "	Quaternion orn1 = nearest(orn0,orn1a);\n"
@@ -588,17 +474,12 @@ static const char* solveConstraintRowsCL= \
 "	else\n"
 "		*axis /= sqrt(len);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n"
 "{\n"
 "	Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n"
 "	Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;\n"
-"\n"
 "	int s = info->rowskip;\n"
 "	int start_index = start_row * s;\n"
-"\n"
 "	// 3 rows to make body rotations equal\n"
 "	info->m_J1angularAxis[start_index] = 1;\n"
 "	info->m_J1angularAxis[start_index + s + 1] = 1;\n"
@@ -626,16 +507,12 @@ static const char* solveConstraintRowsCL= \
 "        info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j];\n"
 "    }\n"
 "	\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numBodies)\n"
 "		return;\n"
-"\n"
 "	if (bodies[i].m_invMass)\n"
 "	{\n"
 "//		if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n"
@@ -648,8 +525,6 @@ static const char* solveConstraintRowsCL= \
 "		} \n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n"
 "							__global unsigned int* infos, \n"
 "							__global unsigned int* constraintRowOffsets, \n"
@@ -665,7 +540,6 @@ static const char* solveConstraintRowsCL= \
 "							int globalNumIterations,\n"
 "							int numConstraints)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numConstraints)\n"
 "		return;\n"
@@ -675,17 +549,12 @@ static const char* solveConstraintRowsCL= \
 "			\n"
 "	__global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n"
 "	__global b3GpuGenericConstraint* constraint = &constraints[i];\n"
-"\n"
 "	__global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n"
 "	__global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n"
-"\n"
 "	int solverBodyIdA = constraint->m_rbA;\n"
 "	int solverBodyIdB = constraint->m_rbB;\n"
-"\n"
 "	__global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n"
 "	__global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n"
-"\n"
-"\n"
 "	if (rbA->m_invMass)\n"
 "	{\n"
 "		batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n"
@@ -695,7 +564,6 @@ static const char* solveConstraintRowsCL= \
 "//				m_staticIdx = 0;\n"
 "		batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n"
 "	}\n"
-"\n"
 "	if (rbB->m_invMass)\n"
 "	{\n"
 "		batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n"
@@ -705,14 +573,11 @@ static const char* solveConstraintRowsCL= \
 "//				m_staticIdx = 0;\n"
 "		batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n"
 "	}\n"
-"\n"
 "	if (info1)\n"
 "	{\n"
 "		int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n"
 "//		if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n"
 "	//		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n"
-"\n"
-"\n"
 "		int j;\n"
 "		for ( j=0;j<info1;j++)\n"
 "		{\n"
@@ -728,7 +593,6 @@ static const char* solveConstraintRowsCL= \
 "			currentConstraintRow[j].m_jacDiagABInv = 0.f;\n"
 "			currentConstraintRow[j].m_lowerLimit = 0.f;\n"
 "			currentConstraintRow[j].m_upperLimit = 0.f;\n"
-"\n"
 "			currentConstraintRow[j].m_originalConstraint = i;\n"
 "			currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n"
 "			currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n"
@@ -746,7 +610,6 @@ static const char* solveConstraintRowsCL= \
 "			currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n"
 "			currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;		\n"
 "		}\n"
-"\n"
 "		bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n"
 "		bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n"
 "		bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n"
@@ -755,12 +618,8 @@ static const char* solveConstraintRowsCL= \
 "		bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n"
 "		bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n"
 "		bodyBPtr->m_turnVelocity  = (float4)(0,0,0,0);\n"
-"\n"
 "		int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n"
-"\n"
 "		\n"
-"\n"
-"\n"
 "		b3GpuConstraintInfo2 info2;\n"
 "		info2.fps = 1.f/timeStep;\n"
 "		info2.erp = globalErp;\n"
@@ -769,7 +628,6 @@ static const char* solveConstraintRowsCL= \
 "		info2.m_J2linearAxisFloat4 = 0;\n"
 "		info2.m_J2angularAxisFloat4 = &currentConstraintRow->m_relpos2CrossNormal;\n"
 "		info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n"
-"\n"
 "		///the size of b3SolverConstraint needs be a multiple of float\n"
 "//		b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n"
 "		info2.m_constraintError = &currentConstraintRow->m_rhs;\n"
@@ -779,7 +637,6 @@ static const char* solveConstraintRowsCL= \
 "		info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;\n"
 "		info2.m_upperLimit = &currentConstraintRow->m_upperLimit;\n"
 "		info2.m_numIterations = globalNumIterations;\n"
-"\n"
 "		switch (constraint->m_constraintType)\n"
 "		{\n"
 "			case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n"
@@ -790,37 +647,29 @@ static const char* solveConstraintRowsCL= \
 "			case B3_GPU_FIXED_CONSTRAINT_TYPE:\n"
 "			{\n"
 "				getInfo2Point2Point(constraint,&info2,bodies);\n"
-"\n"
 "				getInfo2FixedOrientation(constraint,&info2,bodies,3);\n"
-"\n"
 "				break;\n"
 "			}\n"
-"\n"
 "			default:\n"
 "			{\n"
 "			}\n"
 "		}\n"
-"\n"
 "		///finalize the constraint setup\n"
 "		for ( j=0;j<info1;j++)\n"
 "		{\n"
 "			__global b3SolverConstraint* solverConstraint = &currentConstraintRow[j];\n"
-"\n"
 "			if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n"
 "			{\n"
 "				solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n"
 "			}\n"
-"\n"
 "			if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n"
 "			{\n"
 "				solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n"
 "			}\n"
-"\n"
 "//						solverConstraint->m_originalContactPoint = constraint;\n"
 "							\n"
 "			Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n"
 "			{\n"
-"\n"
 "				//float4 angularFactorA(1,1,1);\n"
 "				float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n"
 "				solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n"
@@ -828,11 +677,9 @@ static const char* solveConstraintRowsCL= \
 "						\n"
 "			Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n"
 "			{\n"
-"\n"
 "				float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n"
 "				solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n"
 "			}\n"
-"\n"
 "			{\n"
 "				//it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal\n"
 "				//because it gets multiplied iMJlB\n"
@@ -840,7 +687,6 @@ static const char* solveConstraintRowsCL= \
 "				float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n"
 "				float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n"
 "				float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n"
-"\n"
 "				float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n"
 "				sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n"
 "				sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n"
@@ -854,17 +700,13 @@ static const char* solveConstraintRowsCL= \
 "					solverConstraint->m_jacDiagABInv = 0.f;\n"
 "				}\n"
 "			}\n"
-"\n"
-"\n"
 "			///fix rhs\n"
 "			///todo: add force/torque accelerators\n"
 "			{\n"
 "				float rel_vel;\n"
 "				float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n"
 "				float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n"
-"\n"
 "				rel_vel = vel1Dotn+vel2Dotn;\n"
-"\n"
 "				float restitution = 0.f;\n"
 "				float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n"
 "				float	velocityError = restitution - rel_vel * info2.m_damping;\n"
@@ -872,7 +714,6 @@ static const char* solveConstraintRowsCL= \
 "				float	velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n"
 "				solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n"
 "				solverConstraint->m_appliedImpulse = 0.f;\n"
-"\n"
 "			}\n"
 "		}\n"
 "	}\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl
@@ -204,22 +204,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-	
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h
@@ -2,37 +2,29 @@
 static const char* solveContactCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,43 +38,28 @@ static const char* solveContactCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define mymake_float4 (float4)\n"
 "//#define make_float2 (float2)\n"
 "//#define make_uint4 (uint4)\n"
 "//#define make_int4 (int4)\n"
 "//#define make_uint2 (uint2)\n"
 "//#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -90,10 +67,6 @@ static const char* solveContactCL= \
 "	float4 b1 = mymake_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -102,33 +75,17 @@ static const char* solveContactCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -139,54 +96,39 @@ static const char* solveContactCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -195,34 +137,13 @@ static const char* solveContactCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"	\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -231,7 +152,6 @@ static const char* solveContactCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -240,27 +160,20 @@ static const char* solveContactCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = mymake_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
 "{\n"
@@ -271,32 +184,25 @@ static const char* solveContactCL= \
 "	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
 "	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
 "}\n"
-"\n"
-"\n"
 "void solveContact(__global Constraint4* cs,\n"
 "				  float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
 "				  float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
-"\n"
 "void solveContact(__global Constraint4* cs,\n"
 "			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
 "			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
 "{\n"
 "	float minRambdaDt = 0;\n"
 "	float maxRambdaDt = FLT_MAX;\n"
-"\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
-"\n"
 "		float4 angular0, angular1, linear;\n"
 "		float4 r0 = cs->m_worldPos[ic] - posA;\n"
 "		float4 r1 = cs->m_worldPos[ic] - posB;\n"
 "		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
-"\n"
 "		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
 "			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
 "		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
-"\n"
 "		{\n"
 "			float prevSum = cs->m_appliedRambdaDt[ic];\n"
 "			float updated = prevSum;\n"
@@ -306,19 +212,16 @@ static const char* solveContactCL= \
 "			rambdaDt = updated - prevSum;\n"
 "			cs->m_appliedRambdaDt[ic] = updated;\n"
 "		}\n"
-"\n"
 "		float4 linImp0 = invMassA*linear*rambdaDt;\n"
 "		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
 "		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
 "		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
-"\n"
 "		*linVelA += linImp0;\n"
 "		*angVelA += angImp0;\n"
 "		*linVelB += linImp1;\n"
 "		*angVelB += angImp1;\n"
 "	}\n"
 "}\n"
-"\n"
 "void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
 " void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
 "{\n"
@@ -347,29 +250,24 @@ static const char* solveContactCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
 "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
 "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
 "{\n"
 "	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "			posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
-"\n"
 "  if (gBodies[aIdx].m_invMass)\n"
 "  {\n"
 "		gBodies[aIdx].m_linVel = linVelA;\n"
@@ -390,27 +288,18 @@ static const char* solveContactCL= \
 "		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
 "	\n"
 "	}\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void BatchSolveKernelContact(__global Body* gBodies,\n"
@@ -427,33 +316,26 @@ static const char* solveContactCL= \
 "	__local int ldsCurBatch;\n"
 "	__local int ldsNextBatch;\n"
 "	__local int ldsStart;\n"
-"\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
-"\n"
 "//	int gIdx = GET_GLOBAL_IDX;\n"
 "//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
 "	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
-"\n"
-"\n"
 "	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
 "	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
 "	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
 "	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
 "	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
-"\n"
 "	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
 "	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
 "	//int cellIdx = xIdx+yIdx*nSplit;\n"
 "	\n"
 "	if( gN[cellIdx] == 0 ) \n"
 "		return;\n"
-"\n"
 "	\n"
 "	\n"
 "	const int start = gOffsets[cellIdx];\n"
 "	const int end = start + gN[cellIdx];\n"
-"\n"
 "	\n"
 "	\n"
 "	\n"
@@ -463,10 +345,7 @@ static const char* solveContactCL= \
 "		ldsNextBatch = 0;\n"
 "		ldsStart = start;\n"
 "	}\n"
-"\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	int idx=ldsStart+lIdx;\n"
 "	while (ldsCurBatch < maxBatch)\n"
 "	{\n"
@@ -475,7 +354,6 @@ static const char* solveContactCL= \
 "			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
 "			{\n"
 "					solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
-"\n"
 "				 idx+=64;\n"
 "			} else\n"
 "			{\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl
@@ -204,22 +204,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
@@ -2,37 +2,29 @@
 static const char* solveFrictionCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,43 +38,28 @@ static const char* solveFrictionCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define mymake_float4 (float4)\n"
 "//#define make_float2 (float2)\n"
 "//#define make_uint4 (uint4)\n"
 "//#define make_int4 (int4)\n"
 "//#define make_uint2 (uint2)\n"
 "//#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -90,10 +67,6 @@ static const char* solveFrictionCL= \
 "	float4 b1 = mymake_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -102,33 +75,17 @@ static const char* solveFrictionCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -139,54 +96,39 @@ static const char* solveFrictionCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -195,34 +137,13 @@ static const char* solveFrictionCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -231,7 +152,6 @@ static const char* solveFrictionCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -240,27 +160,20 @@ static const char* solveFrictionCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = mymake_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
 "{\n"
@@ -299,33 +212,26 @@ static const char* solveFrictionCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
-"\n"
 "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
 "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
 "{\n"
 "	float frictionCoeff = ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
 "	\n"
-"\n"
 "	{\n"
 "		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
 "		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
-"\n"
 "		float sum = 0;\n"
 "		for(int j=0; j<4; j++)\n"
 "		{\n"
@@ -337,7 +243,6 @@ static const char* solveFrictionCL= \
 "			maxRambdaDt[j] = frictionCoeff*sum;\n"
 "			minRambdaDt[j] = -maxRambdaDt[j];\n"
 "		}\n"
-"\n"
 "		\n"
 "//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
@@ -397,11 +302,9 @@ static const char* solveFrictionCL= \
 "				}\n"
 "			}\n"
 "		}\n"
-"\n"
 "		\n"
 "		\n"
 "	}\n"
-"\n"
 "	if (gBodies[aIdx].m_invMass)\n"
 "	{\n"
 "		gBodies[aIdx].m_linVel = linVelA;\n"
@@ -421,25 +324,18 @@ static const char* solveFrictionCL= \
 "		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
 "	}\n"
 " \n"
-"\n"
 "}\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void BatchSolveKernelFriction(__global Body* gBodies,\n"
@@ -456,28 +352,21 @@ static const char* solveFrictionCL= \
 "	__local int ldsCurBatch;\n"
 "	__local int ldsNextBatch;\n"
 "	__local int ldsStart;\n"
-"\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
-"\n"
 "//	int gIdx = GET_GLOBAL_IDX;\n"
 "//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
 "	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
-"\n"
-"\n"
 "	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
 "	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
 "	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
 "	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
 "	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
-"\n"
 "	\n"
 "	if( gN[cellIdx] == 0 ) \n"
 "		return;\n"
-"\n"
 "	const int start = gOffsets[cellIdx];\n"
 "	const int end = start + gN[cellIdx];\n"
-"\n"
 "	\n"
 "	if( lIdx == 0 )\n"
 "	{\n"
@@ -485,10 +374,7 @@ static const char* solveFrictionCL= \
 "		ldsNextBatch = 0;\n"
 "		ldsStart = start;\n"
 "	}\n"
-"\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	int idx=ldsStart+lIdx;\n"
 "	while (ldsCurBatch < maxBatch)\n"
 "	{\n"
@@ -496,9 +382,7 @@ static const char* solveFrictionCL= \
 "		{\n"
 "			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
 "			{\n"
-"\n"
 "					solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
-"\n"
 "				 idx+=64;\n"
 "			} else\n"
 "			{\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
@@ -14,6 +14,7 @@ subject to the following restrictions:
 */
 //Originally written by Takahiro Harada

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -403,22 +404,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
@@ -525,7 +511,7 @@ void btPlaneSpace1 (float4 n, float4* p, float4* q);

 void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
 	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, 
-	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,
 	Constraint4* dstC )
 {
 	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
@@ -622,7 +608,7 @@ typedef struct

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, 
+void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, 
 int nContacts,
 float dt,
 float positionDrift,
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
@@ -2,37 +2,71 @@
 static const char* solverSetupCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,22 +80,15 @@ static const char* solverSetupCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
@@ -71,52 +98,43 @@ static const char* solverSetupCL= \
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastSqrtf(float f2)\n"
 "{\n"
 "	return native_sqrt(f2);\n"
 "//	return sqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastRSqrt(float f2)\n"
 "{\n"
 "	return native_rsqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastLength4(float4 v)\n"
 "{\n"
 "	return fast_length(v);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float sqrtf(float a)\n"
 "{\n"
 "//	return sqrt(a);\n"
 "	return native_sqrt(a);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -124,26 +142,22 @@ static const char* solverSetupCL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float length3(const float4 a)\n"
 "{\n"
 "	return sqrtf(dot3F4(a,a));\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot4(const float4 a, const float4 b)\n"
 "{\n"
 "	return dot( a, b );\n"
 "}\n"
-"\n"
 "//	for height\n"
 "__inline\n"
 "float dot3w1(const float4 point, const float4 eqn)\n"
 "{\n"
 "	return dot3F4(point,eqn) + eqn.w;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -152,14 +166,12 @@ static const char* solverSetupCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize4(const float4 a)\n"
 "{\n"
 "	float length = sqrtf(dot4(a, a));\n"
 "	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
 "{\n"
@@ -170,34 +182,25 @@ static const char* solverSetupCL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero()\n"
 "{\n"
@@ -207,7 +210,6 @@ static const char* solverSetupCL= \
 "	m.m_row[2] = (float4)(0.f);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity()\n"
 "{\n"
@@ -217,7 +219,6 @@ static const char* solverSetupCL= \
 "	m.m_row[2] = (float4)(0,0,1,0);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -227,7 +228,6 @@ static const char* solverSetupCL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -248,7 +248,6 @@ static const char* solverSetupCL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -259,44 +258,32 @@ static const char* solverSetupCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 qtGetRotationMatrix(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -307,7 +294,6 @@ static const char* solverSetupCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -324,67 +310,52 @@ static const char* solverSetupCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n"
 "{\n"
 "	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
 "	Matrix3x3 out;\n"
-"\n"
 "	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
 "	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
 "	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
 "	out.m_row[0].w = 0.f;\n"
-"\n"
 "	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
 "	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
 "	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
 "	out.m_row[1].w = 0.f;\n"
-"\n"
 "	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
 "	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
 "	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
 "	out.m_row[2].w = 0.f;\n"
-"\n"
 "	return out;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -393,34 +364,13 @@ static const char* solverSetupCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -429,7 +379,6 @@ static const char* solverSetupCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -438,22 +387,16 @@ static const char* solverSetupCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = make_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
 "{\n"
@@ -464,27 +407,18 @@ static const char* solverSetupCL= \
 "	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
 "	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 " \n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
@@ -492,8 +426,6 @@ static const char* solverSetupCL= \
 "	float m_scale;\n"
 "	int m_nSplit;\n"
 "} ConstBufferSSD;\n"
-"\n"
-"\n"
 "void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
 " void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
 "{\n"
@@ -522,84 +454,68 @@ static const char* solverSetupCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
-"\n"
 "void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
 "	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
-"	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
+"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
 "	Constraint4* dstC )\n"
 "{\n"
 "	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
 "	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
-"\n"
 "	float dtInv = 1.f/dt;\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "	}\n"
 "	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
-"\n"
-"\n"
 "	dstC->m_linear = -src->m_worldNormal;\n"
 "	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		float4 r0 = src->m_worldPos[ic] - posA;\n"
 "		float4 r1 = src->m_worldPos[ic] - posB;\n"
-"\n"
 "		if( ic >= src->m_worldNormal.w )//npoints\n"
 "		{\n"
 "			dstC->m_jacCoeffInv[ic] = 0.f;\n"
 "			continue;\n"
 "		}\n"
-"\n"
 "		float relVelN;\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
-"\n"
 "			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
 "				linVelA, angVelA, linVelB, angVelB);\n"
-"\n"
 "			float e = 0.f;//src->getRestituitionCoeff();\n"
 "			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
-"\n"
 "			dstC->m_b[ic] = e*relVelN;\n"
 "			//float penetration = src->m_worldPos[ic].w;\n"
 "			dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
 "			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "		}\n"
 "	}\n"
-"\n"
 "	if( src->m_worldNormal.w > 0 )//npoints\n"
 "	{	//	prepare friction\n"
 "		float4 center = make_float4(0.f);\n"
 "		for(int i=0; i<src->m_worldNormal.w; i++) \n"
 "			center += src->m_worldPos[i];\n"
 "		center /= (float)src->m_worldNormal.w;\n"
-"\n"
 "		float4 tangent[2];\n"
 "		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
 "		\n"
 "		float4 r[2];\n"
 "		r[0] = center - posA;\n"
 "		r[1] = center - posB;\n"
-"\n"
 "		for(int i=0; i<2; i++)\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
 "			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
 "		}\n"
 "		dstC->m_center = center;\n"
 "	}\n"
-"\n"
 "	for(int i=0; i<4; i++)\n"
 "	{\n"
 "		if( i<src->m_worldNormal.w )\n"
@@ -612,7 +528,6 @@ static const char* solverSetupCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
@@ -620,10 +535,9 @@ static const char* solverSetupCL= \
 "	float m_positionDrift;\n"
 "	float m_positionConstraintCoeff;\n"
 "} ConstBufferCTC;\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n"
+"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n"
 "int nContacts,\n"
 "float dt,\n"
 "float positionDrift,\n"
@@ -636,33 +550,23 @@ static const char* solverSetupCL= \
 "	{\n"
 "		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
 "		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
-"\n"
 "		float4 posA = gBodies[aIdx].m_pos;\n"
 "		float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "		float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "		float invMassA = gBodies[aIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "		float4 posB = gBodies[bIdx].m_pos;\n"
 "		float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "		float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "		float invMassB = gBodies[bIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "		Constraint4 cs;\n"
-"\n"
 "    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
 "			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n"
 "			&cs );\n"
 "		\n"
 "		cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
-"\n"
 "		gConstraintOut[gIdx] = cs;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
@@ -14,6 +14,8 @@ subject to the following restrictions:
 //Originally written by Takahiro Harada


+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
@@ -377,22 +379,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
@@ -435,7 +422,7 @@ typedef struct
 //	others
 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )
+void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )
 {
 	int nContacts = cb.x;
 	int gIdx = GET_GLOBAL_IDX;
@@ -448,7 +435,7 @@ void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __globa
 }

 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataChildShapeB(__global Contact4* contactsIn, __global int2* sortDataOut, int nContacts)
+void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -462,7 +449,7 @@ void SetDeterminismSortDataChildShapeB(__global Contact4* contactsIn, __global i
 }

 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataChildShapeA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)
+void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -478,7 +465,7 @@ void SetDeterminismSortDataChildShapeA(__global Contact4* contactsIn, __global i
 }

 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)
+void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -496,7 +483,7 @@ void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* s

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataBodyB(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)
+void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -552,7 +539,7 @@ static __constant const int gridTable8x8[] =

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, 
+void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, 
 int nContacts,float scale,int4 nSplit,int staticIdx)

 {
@@ -613,7 +600,7 @@ int nContacts,float scale,int4 nSplit,int staticIdx)

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )
+void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )
 {
 	int gIdx = GET_GLOBAL_IDX;
 	if( gIdx < cb.x )
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
@@ -2,37 +2,71 @@
 static const char* solverSetup2CL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,22 +80,15 @@ static const char* solverSetup2CL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
@@ -71,52 +98,43 @@ static const char* solverSetup2CL= \
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastSqrtf(float f2)\n"
 "{\n"
 "	return native_sqrt(f2);\n"
 "//	return sqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastRSqrt(float f2)\n"
 "{\n"
 "	return native_rsqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastLength4(float4 v)\n"
 "{\n"
 "	return fast_length(v);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float sqrtf(float a)\n"
 "{\n"
 "//	return sqrt(a);\n"
 "	return native_sqrt(a);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -124,26 +142,22 @@ static const char* solverSetup2CL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float length3(const float4 a)\n"
 "{\n"
 "	return sqrtf(dot3F4(a,a));\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot4(const float4 a, const float4 b)\n"
 "{\n"
 "	return dot( a, b );\n"
 "}\n"
-"\n"
 "//	for height\n"
 "__inline\n"
 "float dot3w1(const float4 point, const float4 eqn)\n"
 "{\n"
 "	return dot3F4(point,eqn) + eqn.w;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -152,14 +166,12 @@ static const char* solverSetup2CL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize4(const float4 a)\n"
 "{\n"
 "	float length = sqrtf(dot4(a, a));\n"
 "	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
 "{\n"
@@ -170,34 +182,25 @@ static const char* solverSetup2CL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero()\n"
 "{\n"
@@ -207,7 +210,6 @@ static const char* solverSetup2CL= \
 "	m.m_row[2] = (float4)(0.f);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity()\n"
 "{\n"
@@ -217,7 +219,6 @@ static const char* solverSetup2CL= \
 "	m.m_row[2] = (float4)(0,0,1,0);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -227,7 +228,6 @@ static const char* solverSetup2CL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -248,7 +248,6 @@ static const char* solverSetup2CL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -259,43 +258,30 @@ static const char* solverSetup2CL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -306,7 +292,6 @@ static const char* solverSetup2CL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -323,43 +308,33 @@ static const char* solverSetup2CL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -368,34 +343,13 @@ static const char* solverSetup2CL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -404,7 +358,6 @@ static const char* solverSetup2CL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -413,47 +366,35 @@ static const char* solverSetup2CL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
-"\n"
 " \n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "//	others\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )\n"
+"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
 "{\n"
 "	int nContacts = cb.x;\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int srcIdx = sortData[gIdx].y;\n"
 "		out[gIdx] = in[srcIdx];\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataChildShapeB(__global Contact4* contactsIn, __global int2* sortDataOut, int nContacts)\n"
+"void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sd;\n"
@@ -462,12 +403,10 @@ static const char* solverSetup2CL= \
 "		sortDataOut[gIdx] = sd;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataChildShapeA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sdIn;\n"
@@ -478,12 +417,10 @@ static const char* solverSetup2CL= \
 "		sortDataInOut[gIdx] = sdOut;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sdIn;\n"
@@ -494,14 +431,11 @@ static const char* solverSetup2CL= \
 "		sortDataInOut[gIdx] = sdOut;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataBodyB(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sdIn;\n"
@@ -512,10 +446,6 @@ static const char* solverSetup2CL= \
 "		sortDataInOut[gIdx] = sdOut;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
@@ -523,8 +453,6 @@ static const char* solverSetup2CL= \
 "	float m_scale;\n"
 "	int m_nSplit;\n"
 "} ConstBufferSSD;\n"
-"\n"
-"\n"
 "static __constant const int gridTable4x4[] = \n"
 "{\n"
 "    0,1,17,16,\n"
@@ -532,7 +460,6 @@ static const char* solverSetup2CL= \
 "	17,18,32,3,\n"
 "	16,19,3,34\n"
 "};\n"
-"\n"
 "static __constant const int gridTable8x8[] = \n"
 "{\n"
 "	  0,  2,  3, 16, 17, 18, 19,  1,\n"
@@ -545,18 +472,12 @@ static const char* solverSetup2CL= \
 "	197,27,214,213,212,199,198,196\n"
 "	\n"
 "};\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define USE_SPATIAL_BATCHING 1\n"
 "#define USE_4x4_GRID 1\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
+"void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
 "int nContacts,float scale,int4 nSplit,int staticIdx)\n"
-"\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	\n"
@@ -564,13 +485,10 @@ static const char* solverSetup2CL= \
 "	{\n"
 "		int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;\n"
 "		int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;\n"
-"\n"
 "		int aIdx = abs(aPtrAndSignBit );\n"
 "		int bIdx = abs(bPtrAndSignBit);\n"
-"\n"
 "		bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n"
 "		bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n"
-"\n"
 "#if USE_SPATIAL_BATCHING		\n"
 "		int idx = (aStatic)? bIdx: aIdx;\n"
 "		float4 p = gBodies[idx].m_pos;\n"
@@ -587,7 +505,6 @@ static const char* solverSetup2CL= \
 "			aa = bb;\n"
 "		if (bStatic)\n"
 "			bb = aa;\n"
-"\n"
 "		int gridIndex = aa + bb*4;\n"
 "		int newIndex = gridTable4x4[gridIndex];\n"
 "	#else//USE_4x4_GRID\n"
@@ -597,13 +514,10 @@ static const char* solverSetup2CL= \
 "			aa = bb;\n"
 "		if (bStatic)\n"
 "			bb = aa;\n"
-"\n"
 "		int gridIndex = aa + bb*8;\n"
 "		int newIndex = gridTable8x8[gridIndex];\n"
 "	#endif//USE_4x4_GRID\n"
 "#endif//USE_SPATIAL_BATCHING\n"
-"\n"
-"\n"
 "		gSortDataOut[gIdx].x = newIndex;\n"
 "		gSortDataOut[gIdx].y = gIdx;\n"
 "	}\n"
@@ -612,10 +526,9 @@ static const char* solverSetup2CL= \
 "		gSortDataOut[gIdx].x = 0xffffffff;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )\n"
+"void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	if( gIdx < cb.x )\n"
@@ -623,7 +536,4 @@ static const char* solverSetup2CL= \
 "		gOut[gIdx] = gIn[gIdx];\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl
@@ -13,6 +13,8 @@ subject to the following restrictions:
 */
 //Originally written by Erwin Coumans

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
@@ -380,25 +382,10 @@ typedef struct
 	u32 m_paddings;
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;
-
-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;


-__kernel void CountBodiesKernel(__global Contact4* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)
+
+__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)
 {
 	int i = GET_GLOBAL_IDX;
 	
@@ -844,7 +831,7 @@ __kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* of

 void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
 	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, 
-	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,
 	Constraint4* dstC )
 {
 	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
@@ -934,7 +921,7 @@ void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVe

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ContactToConstraintSplitKernel(__global const Contact4* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, 
+void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, 
 __global const unsigned int* bodyCount,
 int nContacts,
 float dt,
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
@@ -2,36 +2,71 @@
 static const char* solverUtilsCL= \
 "/*\n"
 "Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -45,22 +80,15 @@ static const char* solverUtilsCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
@@ -70,57 +98,47 @@ static const char* solverUtilsCL= \
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastSqrtf(float f2)\n"
 "{\n"
 "	return native_sqrt(f2);\n"
 "//	return sqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastRSqrt(float f2)\n"
 "{\n"
 "	return native_rsqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastLength4(float4 v)\n"
 "{\n"
 "	return fast_length(v);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float sqrtf(float a)\n"
 "{\n"
 "//	return sqrt(a);\n"
 "	return native_sqrt(a);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a1, float4 b1)\n"
 "{\n"
-"\n"
 "	float4 	a=make_float4(a1.xyz,0.f);\n"
 "	float4 	b=make_float4(b1.xyz,0.f);\n"
 "	//float4 	a=a1;\n"
 "	//float4 	b=b1;\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -128,26 +146,22 @@ static const char* solverUtilsCL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float length3(const float4 a)\n"
 "{\n"
 "	return sqrtf(dot3F4(a,a));\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot4(const float4 a, const float4 b)\n"
 "{\n"
 "	return dot( a, b );\n"
 "}\n"
-"\n"
 "//	for height\n"
 "__inline\n"
 "float dot3w1(const float4 point, const float4 eqn)\n"
 "{\n"
 "	return dot3F4(point,eqn) + eqn.w;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -156,14 +170,12 @@ static const char* solverUtilsCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize4(const float4 a)\n"
 "{\n"
 "	float length = sqrtf(dot4(a, a));\n"
 "	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
 "{\n"
@@ -174,34 +186,25 @@ static const char* solverUtilsCL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero()\n"
 "{\n"
@@ -211,7 +214,6 @@ static const char* solverUtilsCL= \
 "	m.m_row[2] = (float4)(0.f);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity()\n"
 "{\n"
@@ -221,7 +223,6 @@ static const char* solverUtilsCL= \
 "	m.m_row[2] = (float4)(0,0,1,0);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -231,7 +232,6 @@ static const char* solverUtilsCL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -252,7 +252,6 @@ static const char* solverUtilsCL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -263,43 +262,30 @@ static const char* solverUtilsCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -310,7 +296,6 @@ static const char* solverUtilsCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -327,43 +312,33 @@ static const char* solverUtilsCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -372,35 +347,14 @@ static const char* solverUtilsCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings;\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
-"\n"
-"__kernel void CountBodiesKernel(__global Contact4* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n"
+"__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n"
 "{\n"
 "	int i = GET_GLOBAL_IDX;\n"
 "	\n"
@@ -423,7 +377,6 @@ static const char* solverUtilsCL= \
 "		} \n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n"
 "{\n"
 "	int i = GET_GLOBAL_IDX;\n"
@@ -434,8 +387,6 @@ static const char* solverUtilsCL= \
 "		angularVelocities[i] = make_float4(0);\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
 "__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
 "{\n"
@@ -465,23 +416,16 @@ static const char* solverUtilsCL= \
 "		}//bodies[i].m_invMass\n"
 "	}//i<numBodies\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = make_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n"
 "{\n"
@@ -492,8 +436,6 @@ static const char* solverUtilsCL= \
 "	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
 "	return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n"
 "}\n"
-"\n"
-"\n"
 "void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
 " void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
 "{\n"
@@ -522,11 +464,6 @@ static const char* solverUtilsCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "void solveContact(__global Constraint4* cs,\n"
 "			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
 "			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n"
@@ -534,22 +471,17 @@ static const char* solverUtilsCL= \
 "{\n"
 "	float minRambdaDt = 0;\n"
 "	float maxRambdaDt = FLT_MAX;\n"
-"\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
-"\n"
 "		float4 angular0, angular1, linear;\n"
 "		float4 r0 = cs->m_worldPos[ic] - posA;\n"
 "		float4 r1 = cs->m_worldPos[ic] - posB;\n"
 "		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
 "	\n"
-"\n"
-"\n"
 "		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
 "			*linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n"
 "		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
-"\n"
 "		\n"
 "		{\n"
 "			float prevSum = cs->m_appliedRambdaDt[ic];\n"
@@ -560,13 +492,11 @@ static const char* solverUtilsCL= \
 "			rambdaDt = updated - prevSum;\n"
 "			cs->m_appliedRambdaDt[ic] = updated;\n"
 "		}\n"
-"\n"
 "			\n"
 "		float4 linImp0 = invMassA*linear*rambdaDt;\n"
 "		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
 "		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
 "		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
-"\n"
 "		\n"
 "		if (invMassA)\n"
 "		{\n"
@@ -580,32 +510,24 @@ static const char* solverUtilsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "//	solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
-"\n"
-"\n"
 "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n"
 "__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
 "__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
 "{\n"
-"\n"
 "	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "			\n"
 "	float4 dLinVelA = make_float4(0,0,0,0);\n"
 "	float4 dAngVelA = make_float4(0,0,0,0);\n"
@@ -621,20 +543,16 @@ static const char* solverUtilsCL= \
 "		dLinVelA = deltaLinearVelocities[splitIndexA];\n"
 "		dAngVelA = deltaAngularVelocities[splitIndexA];\n"
 "	}\n"
-"\n"
 "	int bodyOffsetB = offsetSplitBodies[bIdx];\n"
 "	int constraintOffsetB = contactConstraintOffsets[0].y;\n"
 "	int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
-"\n"
 "	if (invMassB)\n"
 "	{\n"
 "		dLinVelB = deltaLinearVelocities[splitIndexB];\n"
 "		dAngVelB = deltaAngularVelocities[splitIndexB];\n"
 "	}\n"
-"\n"
 "	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "			posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n"
-"\n"
 "	if (invMassA)\n"
 "	{\n"
 "		deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
@@ -645,10 +563,7 @@ static const char* solverUtilsCL= \
 "		deltaLinearVelocities[splitIndexB] = dLinVelB;\n"
 "		deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
 "	}\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
 "__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
 "float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n"
@@ -660,10 +575,6 @@ static const char* solverUtilsCL= \
 "		solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n"
 "							__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
 "							__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
@@ -671,21 +582,17 @@ static const char* solverUtilsCL= \
 "	float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
 "	\n"
-"\n"
 "	float4 dLinVelA = make_float4(0,0,0,0);\n"
 "	float4 dAngVelA = make_float4(0,0,0,0);\n"
 "	float4 dLinVelB = make_float4(0,0,0,0);\n"
@@ -700,24 +607,17 @@ static const char* solverUtilsCL= \
 "		dLinVelA = deltaLinearVelocities[splitIndexA];\n"
 "		dAngVelA = deltaAngularVelocities[splitIndexA];\n"
 "	}\n"
-"\n"
 "	int bodyOffsetB = offsetSplitBodies[bIdx];\n"
 "	int constraintOffsetB = contactConstraintOffsets[0].y;\n"
 "	int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
-"\n"
 "	if (invMassB)\n"
 "	{\n"
 "		dLinVelB = deltaLinearVelocities[splitIndexB];\n"
 "		dAngVelB = deltaAngularVelocities[splitIndexB];\n"
 "	}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "	{\n"
 "		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
 "		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
-"\n"
 "		float sum = 0;\n"
 "		for(int j=0; j<4; j++)\n"
 "		{\n"
@@ -729,7 +629,6 @@ static const char* solverUtilsCL= \
 "			maxRambdaDt[j] = frictionCoeff*sum;\n"
 "			minRambdaDt[j] = -maxRambdaDt[j];\n"
 "		}\n"
-"\n"
 "		\n"
 "//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
@@ -789,11 +688,9 @@ static const char* solverUtilsCL= \
 "				}\n"
 "			}\n"
 "		}\n"
-"\n"
 "		\n"
 "		\n"
 "	}\n"
-"\n"
 "	if (invMassA)\n"
 "	{\n"
 "		deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
@@ -805,10 +702,7 @@ static const char* solverUtilsCL= \
 "		deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
 "	}\n"
 " \n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
 "										__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
 "										__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
@@ -821,8 +715,6 @@ static const char* solverUtilsCL= \
 "		solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
 "									__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
 "{\n"
@@ -841,85 +733,68 @@ static const char* solverUtilsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
 "	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
-"	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n"
+"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n"
 "	Constraint4* dstC )\n"
 "{\n"
 "	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
 "	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
-"\n"
 "	float dtInv = 1.f/dt;\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "	}\n"
 "	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
-"\n"
-"\n"
 "	dstC->m_linear = -src->m_worldNormal;\n"
 "	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		float4 r0 = src->m_worldPos[ic] - posA;\n"
 "		float4 r1 = src->m_worldPos[ic] - posB;\n"
-"\n"
 "		if( ic >= src->m_worldNormal.w )//npoints\n"
 "		{\n"
 "			dstC->m_jacCoeffInv[ic] = 0.f;\n"
 "			continue;\n"
 "		}\n"
-"\n"
 "		float relVelN;\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n"
-"\n"
 "			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
 "				linVelA, angVelA, linVelB, angVelB);\n"
-"\n"
 "			float e = 0.f;//src->getRestituitionCoeff();\n"
 "			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
-"\n"
 "			dstC->m_b[ic] = e*relVelN;\n"
 "			//float penetration = src->m_worldPos[ic].w;\n"
 "			dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
 "			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "		}\n"
 "	}\n"
-"\n"
 "	if( src->m_worldNormal.w > 0 )//npoints\n"
 "	{	//	prepare friction\n"
 "		float4 center = make_float4(0.f);\n"
 "		for(int i=0; i<src->m_worldNormal.w; i++) \n"
 "			center += src->m_worldPos[i];\n"
 "		center /= (float)src->m_worldNormal.w;\n"
-"\n"
 "		float4 tangent[2];\n"
 "		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
 "		\n"
 "		float4 r[2];\n"
 "		r[0] = center - posA;\n"
 "		r[1] = center - posB;\n"
-"\n"
 "		for(int i=0; i<2; i++)\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n"
 "			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
 "		}\n"
 "		dstC->m_center = center;\n"
 "	}\n"
-"\n"
 "	for(int i=0; i<4; i++)\n"
 "	{\n"
 "		if( i<src->m_worldNormal.w )\n"
@@ -932,11 +807,9 @@ static const char* solverUtilsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ContactToConstraintSplitKernel(__global const Contact4* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n"
+"void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n"
 "__global const unsigned int* bodyCount,\n"
 "int nContacts,\n"
 "float dt,\n"
@@ -950,30 +823,24 @@ static const char* solverUtilsCL= \
 "	{\n"
 "		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
 "		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
-"\n"
 "		float4 posA = gBodies[aIdx].m_pos;\n"
 "		float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "		float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "		float invMassA = gBodies[aIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "		float4 posB = gBodies[bIdx].m_pos;\n"
 "		float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "		float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "		float invMassB = gBodies[bIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "		Constraint4 cs;\n"
-"\n"
 "		float countA = invMassA ? (float)bodyCount[aIdx] : 1;\n"
 "		float countB = invMassB ? (float)bodyCount[bIdx] : 1;\n"
-"\n"
 "    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
 "			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n"
 "			&cs  );\n"
 "		\n"
 "		cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
-"\n"
 "		gConstraintOut[gIdx] = cs;\n"
 "	}\n"
 "}\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
@@ -1,15 +1,12 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* updateAabbsKernelCL= \
 "#define SHAPE_CONVEX_HULL 3\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -17,8 +14,6 @@ static const char* updateAabbsKernelCL= \
 "	float4 b1 = (float4)(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -28,13 +23,11 @@ static const char* updateAabbsKernelCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec)\n"
 "{\n"
@@ -44,34 +37,27 @@ static const char* updateAabbsKernelCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
 "{\n"
 "	return qtRotate( *orientation, *p ) + (*translation);\n"
 "}\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4	m_row[3];\n"
 "} Matrix3x3;\n"
-"\n"
 "typedef unsigned int u32;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct Collidable\n"
 "{\n"
 "	int m_unused1;\n"
@@ -79,40 +65,30 @@ static const char* updateAabbsKernelCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "} Collidable;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
-"\n"
 "__inline\n"
 "Matrix3x3 qtGetRotationMatrix(float4 quat)\n"
 "{\n"
 "	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
 "	Matrix3x3 out;\n"
-"\n"
 "	out.m_row[0].x=fabs(1-2*quat2.y-2*quat2.z);\n"
 "	out.m_row[0].y=fabs(2*quat.x*quat.y-2*quat.w*quat.z);\n"
 "	out.m_row[0].z=fabs(2*quat.x*quat.z+2*quat.w*quat.y);\n"
 "	out.m_row[0].w = 0.f;\n"
-"\n"
 "	out.m_row[1].x=fabs(2*quat.x*quat.y+2*quat.w*quat.z);\n"
 "	out.m_row[1].y=fabs(1-2*quat2.x-2*quat2.z);\n"
 "	out.m_row[1].z=fabs(2*quat.y*quat.z-2*quat.w*quat.x);\n"
 "	out.m_row[1].w = 0.f;\n"
-"\n"
 "	out.m_row[2].x=fabs(2*quat.x*quat.z-2*quat.w*quat.y);\n"
 "	out.m_row[2].y=fabs(2*quat.y*quat.z+2*quat.w*quat.x);\n"
 "	out.m_row[2].z=fabs(1-2*quat2.x-2*quat2.y);\n"
 "	out.m_row[2].w = 0.f;\n"
-"\n"
 "	return out;\n"
 "}\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	float			fx;\n"
@@ -120,7 +96,6 @@ static const char* updateAabbsKernelCL= \
 "	float			fz;\n"
 "	int	uw;\n"
 "} btAABBCL;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -130,9 +105,6 @@ static const char* updateAabbsKernelCL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -153,8 +125,6 @@ static const char* updateAabbsKernelCL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void initializeGpuAabbsFull(  const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)\n"
 "{\n"
 "	int nodeID = get_global_id(0);\n"