Change the lcpp Lua preprocessor to keep #defines and comments, and to remove empty lines

Remove duplicate data in b3Contact4 (now in b3Contact4Data, shared between CPU/C++ and OpenCL). OpenCL kernels use #include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h". Increase number of batches back to 250 (from 50); need to fix this hard-coded number (see https://github.com/erwincoumans/bullet3/issues/12). Work towards GJK/EPA, in addition to SAT/clipping (early on).
2013-08-08 12:24:09 -07:00
parent 46a08e3282
commit 3bf003ace1
50 changed files with 920 additions and 2731 deletions
--- a/btgui/OpenGLWindow/Shaders/createShadowMapInstancingPS.h
+++ b/btgui/OpenGLWindow/Shaders/createShadowMapInstancingPS.h
@@ -2,10 +2,7 @@
 static const char* createShadowMapInstancingFragmentShader= \
 "#version 330\n"
 "precision highp float;\n"
-"\n"
-"\n"
 "layout(location = 0) out float fragmentdepth;\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "	fragmentdepth = gl_FragCoord.z;\n"
--- a/btgui/OpenGLWindow/Shaders/createShadowMapInstancingVS.h
+++ b/btgui/OpenGLWindow/Shaders/createShadowMapInstancingVS.h
@@ -2,8 +2,6 @@
 static const char* createShadowMapInstancingVertexShader= \
 "#version 330\n"
 "precision highp float;\n"
-"\n"
-"\n"
 "layout (location = 0) in vec4 position;\n"
 "layout (location = 1) in vec4 instance_position;\n"
 "layout (location = 2) in vec4 instance_quaternion;\n"
@@ -11,11 +9,7 @@ static const char* createShadowMapInstancingVertexShader= \
 "layout (location = 4) in vec3 vertexnormal;\n"
 "layout (location = 5) in vec4 instance_color;\n"
 "layout (location = 6) in vec3 instance_scale;\n"
-"\n"
-"\n"
 "uniform mat4 depthMVP;\n"
-"\n"
-"\n"
 "vec4 quatMul ( in vec4 q1, in vec4 q2 )\n"
 "{\n"
 "    vec3  im = q1.w * q2.xyz + q1.xyz * q2.w + cross ( q1.xyz, q2.xyz );\n"
@@ -23,7 +17,6 @@ static const char* createShadowMapInstancingVertexShader= \
 "    float re = dot ( dt, vec4 ( -1.0, -1.0, -1.0, 1.0 ) );\n"
 "    return vec4 ( im, re );\n"
 "}\n"
-"\n"
 "vec4 quatFromAxisAngle(vec4 axis, in float angle)\n"
 "{\n"
 "    float cah = cos(angle*0.5);\n"
@@ -45,8 +38,6 @@ static const char* createShadowMapInstancingVertexShader= \
 "    vec4 temp = quatMul ( q, p );\n"
 "    return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
 "}\n"
-"\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "	vec4 q = instance_quaternion;\n"
@@ -54,5 +45,4 @@ static const char* createShadowMapInstancingVertexShader= \
 "	vec4 vertexPos = depthMVP * vec4( (instance_position+localcoord).xyz,1);\n"
 "	gl_Position = vertexPos;\n"
 "}\n"
-"\n"
 ;
--- a/btgui/OpenGLWindow/Shaders/instancingPS.h
+++ b/btgui/OpenGLWindow/Shaders/instancingPS.h
@@ -2,26 +2,21 @@
 static const char* instancingFragmentShader= \
 "#version 330\n"
 "precision highp float;\n"
-"\n"
 "in Fragment\n"
 "{\n"
 "     vec4 color;\n"
 "} fragment;\n"
-"\n"
 "in Vert\n"
 "{\n"
 "	vec2 texcoord;\n"
 "} vert;\n"
-"\n"
 "uniform sampler2D Diffuse;\n"
 "in vec3 lightDir,normal,ambient;\n"
 "out vec4 color;\n"
-"\n"
 "void main_textured(void)\n"
 "{\n"
 "   color  = vec4(0.1,0.2,0.3,0.3);\n"
 "}\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "    vec4 texel = fragment.color*texture(Diffuse,vert.texcoord);//fragment.color;\n"
--- a/btgui/OpenGLWindow/Shaders/instancingVS.h
+++ b/btgui/OpenGLWindow/Shaders/instancingVS.h
@@ -2,8 +2,6 @@
 static const char* instancingVertexShader= \
 "#version 330\n"
 "precision highp float;\n"
-"\n"
-"\n"
 "layout (location = 0) in vec4 position;\n"
 "layout (location = 1) in vec4 instance_position;\n"
 "layout (location = 2) in vec4 instance_quaternion;\n"
@@ -11,22 +9,16 @@ static const char* instancingVertexShader= \
 "layout (location = 4) in vec3 vertexnormal;\n"
 "layout (location = 5) in vec4 instance_color;\n"
 "layout (location = 6) in vec3 instance_scale;\n"
-"\n"
-"\n"
 "uniform mat4 ModelViewMatrix;\n"
 "uniform mat4 ProjectionMatrix;\n"
-"\n"
 "out Fragment\n"
 "{\n"
 "     vec4 color;\n"
 "} fragment;\n"
-"\n"
 "out Vert\n"
 "{\n"
 "	vec2 texcoord;\n"
 "} vert;\n"
-"\n"
-"\n"
 "vec4 quatMul ( in vec4 q1, in vec4 q2 )\n"
 "{\n"
 "    vec3  im = q1.w * q2.xyz + q1.xyz * q2.w + cross ( q1.xyz, q2.xyz );\n"
@@ -34,7 +26,6 @@ static const char* instancingVertexShader= \
 "    float re = dot ( dt, vec4 ( -1.0, -1.0, -1.0, 1.0 ) );\n"
 "    return vec4 ( im, re );\n"
 "}\n"
-"\n"
 "vec4 quatFromAxisAngle(vec4 axis, in float angle)\n"
 "{\n"
 "    float cah = cos(angle*0.5);\n"
@@ -56,9 +47,7 @@ static const char* instancingVertexShader= \
 "    vec4 temp = quatMul ( q, p );\n"
 "    return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
 "}\n"
-"\n"
 "out vec3 lightDir,normal,ambient;\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "	vec4 q = instance_quaternion;\n"
@@ -68,18 +57,15 @@ static const char* instancingVertexShader= \
 "	vec4 local_normal = (quatRotate3( vertexnormal,q));\n"
 "	vec3 light_pos = vec3(-0.3,0.1,0.1);\n"
 "	normal = local_normal.xyz;//normalize(ModelViewMatrix * local_normal).xyz;\n"
-"\n"
 "	lightDir = normalize(light_pos);//gl_LightSource[0].position.xyz));\n"
 "//	lightDir = normalize(vec3(gl_LightSource[0].position));\n"
 "		\n"
 "	vec4 axis = vec4(1,1,1,0);\n"
 "	vec4 localcoord = quatRotate3( position.xyz*instance_scale,q);\n"
 "	vec4 vertexPos = ProjectionMatrix * ModelViewMatrix *(instance_position+localcoord);\n"
-"\n"
 "	gl_Position = vertexPos;\n"
 "	\n"
 "	fragment.color = instance_color;\n"
 "	vert.texcoord = uvcoords;\n"
 "}\n"
-"\n"
 ;
--- a/btgui/OpenGLWindow/Shaders/pointSpritePS.h
+++ b/btgui/OpenGLWindow/Shaders/pointSpritePS.h
@@ -2,22 +2,16 @@
 static const char* pointSpriteFragmentShader= \
 "#version 330\n"
 "precision highp float;\n"
-"\n"
 "in Fragment\n"
 "{\n"
 "     vec4 color;\n"
 "} fragment;\n"
-"\n"
-"\n"
 "in vec3 ambient;\n"
-"\n"
 "out vec4 color;\n"
-"\n"
 "void main_textured(void)\n"
 "{\n"
 "    color =  fragment.color;//texture2D(Diffuse,vert.texcoord);//fragment.color;\n"
 "}\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "	vec3 N;\n"
--- a/btgui/OpenGLWindow/Shaders/pointSpriteVS.h
+++ b/btgui/OpenGLWindow/Shaders/pointSpriteVS.h
@@ -2,34 +2,23 @@
 static const char* pointSpriteVertexShader= \
 "#version 330\n"
 "precision highp float;\n"
-"\n"
-"\n"
-"\n"
 "layout (location = 0) in vec4 position;\n"
 "layout (location = 1) in vec4 instance_position;\n"
 "layout (location = 3) in vec2 uvcoords;\n"
 "layout (location = 4) in vec3 vertexnormal;\n"
 "layout (location = 5) in vec4 instance_color;\n"
 "layout (location = 6) in vec3 instance_scale;\n"
-"\n"
-"\n"
 "uniform float screenWidth = 700.f;\n"
 "uniform mat4 ModelViewMatrix;\n"
 "uniform mat4 ProjectionMatrix;\n"
-"\n"
 "out Fragment\n"
 "{\n"
 "     vec4 color;\n"
 "} fragment;\n"
-"\n"
-"\n"
-"\n"
 "//\n"
 "// vector rotation via quaternion\n"
 "//\n"
-"\n"
 "out vec3 ambient;\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "	ambient = vec3(0.3,.3,0.3);\n"
@@ -41,7 +30,6 @@ static const char* pointSpriteVertexShader= \
 "   float dist = length(posEye);\n"
 "	float pointRadius = 1.f;\n"
 "    gl_PointSize = instance_scale.x * pointRadius * (screenWidth / dist);\n"
-"\n"
 "	gl_Position = vertexPos;\n"
 "	\n"
 "	fragment.color = instance_color;\n"
--- a/btgui/OpenGLWindow/Shaders/useShadowMapInstancingPS.h
+++ b/btgui/OpenGLWindow/Shaders/useShadowMapInstancingPS.h
@@ -2,27 +2,19 @@
 static const char* useShadowMapInstancingFragmentShader= \
 "#version 330 core\n"
 "//precision highp float;\n"
-"\n"
 "in Fragment\n"
 "{\n"
 "     vec4 color;\n"
 "} fragment;\n"
-"\n"
 "in Vert\n"
 "{\n"
 "	vec2 texcoord;\n"
 "} vert;\n"
-"\n"
 "uniform sampler2D Diffuse;\n"
 "uniform sampler2DShadow shadowMap;\n"
-"\n"
 "in vec3 lightDir,normal,ambient;\n"
 "in vec4 ShadowCoord;\n"
-"\n"
 "out vec4 color;\n"
-"\n"
-"\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "    vec4 texel = fragment.color*texture(Diffuse,vert.texcoord);//fragment.color;\n"
@@ -41,8 +33,6 @@ static const char* useShadowMapInstancingFragmentShader= \
 "	\n"
 "	float bias = 0.005*tan(acos(intensity));\n"
 "	bias = clamp(bias, 0,0.01);\n"
-"\n"
-"\n"
 "	float visibility = texture(shadowMap, vec3(ShadowCoord.xy,(ShadowCoord.z-bias)/ShadowCoord.w));\n"
 "	\n"
 "	intensity*=2;\n"
--- a/btgui/OpenGLWindow/Shaders/useShadowMapInstancingVS.h
+++ b/btgui/OpenGLWindow/Shaders/useShadowMapInstancingVS.h
@@ -2,8 +2,6 @@
 static const char* useShadowMapInstancingVertexShader= \
 "#version 330 \n"
 "precision highp float;\n"
-"\n"
-"\n"
 "layout (location = 0) in vec4 position;\n"
 "layout (location = 1) in vec4 instance_position;\n"
 "layout (location = 2) in vec4 instance_quaternion;\n"
@@ -11,26 +9,19 @@ static const char* useShadowMapInstancingVertexShader= \
 "layout (location = 4) in vec3 vertexnormal;\n"
 "layout (location = 5) in vec4 instance_color;\n"
 "layout (location = 6) in vec3 instance_scale;\n"
-"\n"
-"\n"
 "uniform mat4 ModelViewMatrix;\n"
 "uniform mat4 ProjectionMatrix;\n"
 "uniform mat4 DepthBiasModelViewProjectionMatrix;\n"
 "uniform mat4 MVP;\n"
-"\n"
 "out vec4 ShadowCoord;\n"
-"\n"
 "out Fragment\n"
 "{\n"
 "     vec4 color;\n"
 "} fragment;\n"
-"\n"
 "out Vert\n"
 "{\n"
 "	vec2 texcoord;\n"
 "} vert;\n"
-"\n"
-"\n"
 "vec4 quatMul ( in vec4 q1, in vec4 q2 )\n"
 "{\n"
 "    vec3  im = q1.w * q2.xyz + q1.xyz * q2.w + cross ( q1.xyz, q2.xyz );\n"
@@ -38,7 +29,6 @@ static const char* useShadowMapInstancingVertexShader= \
 "    float re = dot ( dt, vec4 ( -1.0, -1.0, -1.0, 1.0 ) );\n"
 "    return vec4 ( im, re );\n"
 "}\n"
-"\n"
 "vec4 quatFromAxisAngle(vec4 axis, in float angle)\n"
 "{\n"
 "    float cah = cos(angle*0.5);\n"
@@ -60,9 +50,7 @@ static const char* useShadowMapInstancingVertexShader= \
 "    vec4 temp = quatMul ( q, p );\n"
 "    return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
 "}\n"
-"\n"
 "out vec3 lightDir,normal,ambient;\n"
-"\n"
 "void main(void)\n"
 "{\n"
 "	vec4 q = instance_quaternion;\n"
@@ -72,18 +60,14 @@ static const char* useShadowMapInstancingVertexShader= \
 "	vec4 worldNormal = (quatRotate3( vertexnormal,q));\n"
 "	vec3 light_pos = vec3(-5.f,100,-40);\n"
 "	normal = normalize(worldNormal).xyz;\n"
-"\n"
 "	lightDir = normalize(light_pos);//gl_LightSource[0].position.xyz));\n"
 "		\n"
 "	vec4 axis = vec4(1,1,1,0);\n"
 "	vec4 localcoord = quatRotate3( position.xyz*instance_scale,q);\n"
 "	vec4 vertexPos = MVP* vec4((instance_position+localcoord).xyz,1);\n"
-"\n"
 "	gl_Position = vertexPos;\n"
 "	ShadowCoord = DepthBiasModelViewProjectionMatrix * vec4((instance_position+localcoord).xyz,1);\n"
-"\n"
 "	fragment.color = instance_color;\n"
 "	vert.texcoord = uvcoords;\n"
 "}\n"
-"\n"
 ;
--- a/build3/lcpp.lua
+++ b/build3/lcpp.lua
@@ -98,7 +98,7 @@ lcpp.LCPP_LUA         = false   -- whether to use lcpp to preprocess Lua code (l
 lcpp.LCPP_FFI         = true    -- whether to use lcpp as LuaJIT ffi PreProcessor (if used in luaJIT)
 lcpp.LCPP_TEST        = false   -- whether to run lcpp unit tests when loading lcpp module
 lcpp.ENV              = {}      -- static predefines (env-like)
-lcpp.FAST             = false   -- perf. tweaks when enabled. con: breaks minor stuff like __LINE__ macros
+lcpp.FAST             = true		-- perf. tweaks when enabled. con: breaks minor stuff like __LINE__ macros
 lcpp.DEBUG            = false

 -- PREDEFINES
@@ -438,7 +438,7 @@ local function processLine(state, line)

 	
 	--[[ APPLY MACROS ]]--
-	line = state:apply(line);
+	--line = state:apply(line);
 	
 	return line
 end
@@ -470,7 +470,7 @@ local function processLine2(state, line)
 				if elseif_ then state:elseBlock(state:parseExpr(elseif_))  end
 				if else_   then state:elseBlock(true)                      end
 				if endif   then state:closeBlock()                         end
-				return -- remove structural directives
+				return line
 			end
 		end
 	end
@@ -520,7 +520,7 @@ local function processLine2(state, line)
 					state:define(macroname, replacement)
 				end
 				
-				return
+				return line
 			end
 			
 			-- ignore, because we dont have any pragma directives yet
@@ -537,7 +537,7 @@ local function processLine2(state, line)

 	
 	--[[ APPLY MACROS ]]--
-	line = state:apply(line);
+	--line = state:apply(line);
 	
 	return line
 end
@@ -551,8 +551,11 @@ local function doWork(state)
 			local input = state:getLine()
 			if not input then break end
 			local output = processLine(state, input)
-			if not lcpp.FAST and not output then output = "" end -- output empty skipped lines
+			if not lcpp.FAST and not output then 
+				output = "" end -- output empty skipped lines
+				
 			if lcpp.DEBUG then output = output.." -- "..input end -- input as comment when DEBUG
+			
 			if output then coroutine.yield(output) end
 		end
 		if (oldIndent ~= state:getIndent()) then error("indentation level must be balanced within a file. was:"..oldIndent.." is:"..state:getIndent()) end
--- a/build3/premake4.lua
+++ b/build3/premake4.lua
@@ -116,7 +116,8 @@ if not _OPTIONS["ios"] then
 		include "../Demos3/GpuGuiInitialize"
 		
 		include "../test/OpenCL/BasicInitialize"
--		include "../test/OpenCL/BroadphaseCollision"
+		include "../test/OpenCL/KernelLaunch"--	
+		include "../test/OpenCL/BroadphaseCollision"
 --		include "../test/OpenCL/NarrowphaseCollision"
 		include "../test/OpenCL/ParallelPrimitives"
 		include "../test/OpenCL/RadixSortBenchmark"
@@ -149,6 +150,7 @@ if not _OPTIONS["ios"] then
 	end
 	

+
 	if _OPTIONS["bullet2gpu"] then
 		include "../src/LinearMath"	
 	include "../src/BulletCollision"	
--- a/src/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h
+++ b/src/Bullet3Collision/NarrowPhaseCollision/b3Contact4.h
@@ -23,23 +23,6 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3Contact4 : public b3Contact4Data
 {
 	B3_DECLARE_ALIGNED_ALLOCATOR();

-	b3Vector3	m_worldPos[4];
-	b3Vector3	m_worldNormal;
-//	float m_restituitionCoeff;
-//	float m_frictionCoeff;
-	unsigned short  m_restituitionCoeffCmp;
-	unsigned short  m_frictionCoeffCmp;
-	int m_batchIdx;
-
-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-
-	int m_unused1;
-	int m_unused2;
-
 	int getBodyA()const {return abs(m_bodyAPtrAndSignBit);}
 	int getBodyB()const {return abs(m_bodyBPtrAndSignBit);}
 	bool isBodyAFixed()const { return m_bodyAPtrAndSignBit<0;}
--- a/src/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h
+++ b/src/Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h
@@ -3,13 +3,16 @@

 #include "Bullet3Common/shared/b3Float4.h"

-typedef struct
+typedef  struct b3Contact4Data b3Contact4Data_t;
+
+struct b3Contact4Data
 {
 	b3Float4	m_worldPos[4];
+//	b3Float4	m_localPosB[4];
 	b3Float4	m_worldNormal;	//	w: m_nPoints
-	unsigned int  m_coeffs;
-	unsigned int m_batchIdx;
-
+	unsigned short  m_restituitionCoeffCmp;
+	unsigned short  m_frictionCoeffCmp;
+	int m_batchIdx;
 	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr
 	int m_bodyBPtrAndSignBit;

@@ -18,6 +21,19 @@ typedef struct
 	int m_unused1;
 	int m_unused2;

-} b3Contact4Data;
+	b3Float4	m_localPosA;
+};
+
+inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)	// returns the point count that is kept in contact->m_worldNormal.w
+{
+	return (int)contact->m_worldNormal.w;	// w is a float lane; the stored count is converted back to int
+}
+
+inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)	// stores numPoints in contact->m_worldNormal.w
+{
+	contact->m_worldNormal.w = (float)numPoints;	// w is a float lane, so the int count is converted on store
+}
+
+

 #endif //B3_CONTACT4DATA_H
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
@@ -2,22 +2,18 @@
 static const char* sapFastCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
 "#define NEW_PAIR_MARKER -1\n"
 "#define REMOVED_PAIR_MARKER -2\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	union\n"
@@ -33,7 +29,6 @@ static const char* sapFastCL= \
 "		int			m_maxIndices[4];\n"
 "	};\n"
 "} btAabbCL;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	union\n"
@@ -41,7 +36,6 @@ static const char* sapFastCL= \
 "		unsigned int m_key;\n"
 "		unsigned int x;\n"
 "	};\n"
-"\n"
 "	union\n"
 "	{\n"
 "		unsigned int m_value;\n"
@@ -49,8 +43,6 @@ static const char* sapFastCL= \
 "		\n"
 "	};\n"
 "}b3SortData;\n"
-"\n"
-"\n"
 "/// conservative test for overlap between two aabbs\n"
 "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
 "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
@@ -65,7 +57,6 @@ static const char* sapFastCL= \
 "	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
 "	return overlap;\n"
 "}\n"
-"\n"
 "__kernel void   computePairsIncremental3dSapKernel( __global const uint2* objectMinMaxIndexGPUaxis0,\n"
 "													__global const uint2* objectMinMaxIndexGPUaxis1,\n"
 "													__global const uint2* objectMinMaxIndexGPUaxis2,\n"
@@ -88,7 +79,6 @@ static const char* sapFastCL= \
 "	int i = get_global_id(0);\n"
 "	if (i>=numObjects)\n"
 "		return;\n"
-"\n"
 "	__global const uint2* objectMinMaxIndexGPU[3][2];\n"
 "	objectMinMaxIndexGPU[0][0]=objectMinMaxIndexGPUaxis0;\n"
 "	objectMinMaxIndexGPU[1][0]=objectMinMaxIndexGPUaxis1;\n"
@@ -96,7 +86,6 @@ static const char* sapFastCL= \
 "	objectMinMaxIndexGPU[0][1]=objectMinMaxIndexGPUaxis0prev;\n"
 "	objectMinMaxIndexGPU[1][1]=objectMinMaxIndexGPUaxis1prev;\n"
 "	objectMinMaxIndexGPU[2][1]=objectMinMaxIndexGPUaxis2prev;\n"
-"\n"
 "	__global const b3SortData* sortedAxisGPU[3][2];\n"
 "	sortedAxisGPU[0][0] = sortedAxisGPU0;\n"
 "	sortedAxisGPU[1][0] = sortedAxisGPU1;\n"
@@ -104,20 +93,16 @@ static const char* sapFastCL= \
 "	sortedAxisGPU[0][1] = sortedAxisGPU0prev;\n"
 "	sortedAxisGPU[1][1] = sortedAxisGPU1prev;\n"
 "	sortedAxisGPU[2][1] = sortedAxisGPU2prev;\n"
-"\n"
 "	int m_currentBuffer = 0;\n"
-"\n"
 "	for (int axis=0;axis<3;axis++)\n"
 "	{\n"
 "		//int i = checkObjects[a];\n"
-"\n"
 "		unsigned int curMinIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].x;\n"
 "		unsigned int curMaxIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].y;\n"
 "		unsigned int prevMinIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].x;\n"
 "		int dmin = curMinIndex - prevMinIndex;\n"
 "				\n"
 "		unsigned int prevMaxIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].y;\n"
-"\n"
 "		int dmax = curMaxIndex - prevMaxIndex;\n"
 "	\n"
 "		for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)\n"
@@ -132,23 +117,18 @@ static const char* sapFastCL= \
 "					if (otherIndex!=i)\n"
 "					{\n"
 "						bool otherIsMax = ((otherIndex2&1)!=0);\n"
-"\n"
 "						if (otherIsMax)\n"
 "						{\n"
 "									\n"
 "							bool overlap = true;\n"
-"\n"
 "							for (int ax=0;ax<3;ax++)\n"
 "							{\n"
 "								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||\n"
 "									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))\n"
 "									overlap=false;\n"
 "							}\n"
-"\n"
 "						//	b3Assert(overlap2==overlap);\n"
-"\n"
 "							bool prevOverlap = true;\n"
-"\n"
 "							for (int ax=0;ax<3;ax++)\n"
 "							{\n"
 "								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||\n"
@@ -156,11 +136,8 @@ static const char* sapFastCL= \
 "									prevOverlap=false;\n"
 "							}\n"
 "									\n"
-"\n"
 "							//b3Assert(overlap==overlap2);\n"
 "								\n"
-"\n"
-"\n"
 "							if (dmin<0)\n"
 "							{\n"
 "								if (overlap && !prevOverlap)\n"
@@ -185,10 +162,8 @@ static const char* sapFastCL= \
 "											addedHostPairsGPU[curPair].y = newPair.y;\n"
 "											addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;\n"
 "											addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;\n"
-"\n"
 "										}\n"
 "									}\n"
-"\n"
 "								}\n"
 "							} \n"
 "							else\n"
@@ -216,7 +191,6 @@ static const char* sapFastCL= \
 "											removedHostPairsGPU[curPair].y = removedPair.y;\n"
 "											removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;\n"
 "											removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;\n"
-"\n"
 "										}\n"
 "									}\n"
 "								}\n"
@@ -240,7 +214,6 @@ static const char* sapFastCL= \
 "						{\n"
 "									\n"
 "							bool overlap = true;\n"
-"\n"
 "							for (int ax=0;ax<3;ax++)\n"
 "							{\n"
 "								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||\n"
@@ -248,9 +221,7 @@ static const char* sapFastCL= \
 "									overlap=false;\n"
 "							}\n"
 "							//b3Assert(overlap2==overlap);\n"
-"\n"
 "							bool prevOverlap = true;\n"
-"\n"
 "							for (int ax=0;ax<3;ax++)\n"
 "							{\n"
 "								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||\n"
@@ -258,7 +229,6 @@ static const char* sapFastCL= \
 "									prevOverlap=false;\n"
 "							}\n"
 "									\n"
-"\n"
 "							if (dmax>0)\n"
 "							{\n"
 "								if (overlap && !prevOverlap)\n"
@@ -283,7 +253,6 @@ static const char* sapFastCL= \
 "											addedHostPairsGPU[curPair].y = newPair.y;\n"
 "											addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;\n"
 "											addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;\n"
-"\n"
 "										}\n"
 "									}\n"
 "							\n"
@@ -326,16 +295,12 @@ static const char* sapFastCL= \
 "			}\n"
 "		}//for (int otherbuffer\n"
 "	}//for (int axis=0;\n"
-"\n"
-"\n"
 "}\n"
-"\n"
 "//computePairsKernelBatchWrite\n"
 "__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	int localId = get_local_id(0);\n"
-"\n"
 "	__local int numActiveWgItems[1];\n"
 "	__local int breakRequest[1];\n"
 "	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
@@ -411,7 +376,6 @@ static const char* sapFastCL= \
 "							tmpPair.y = myPairs[p].y;\n"
 "							tmpPair.z = NEW_PAIR_MARKER;\n"
 "							tmpPair.w = NEW_PAIR_MARKER;\n"
-"\n"
 "							pairsOut[curPair+p] = tmpPair; //flush to main memory\n"
 "						}\n"
 "					}\n"
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@@ -2,21 +2,17 @@
 static const char* sapCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
 "#define NEW_PAIR_MARKER -1\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	union\n"
@@ -32,8 +28,6 @@ static const char* sapCL= \
 "		int			m_maxIndices[4];\n"
 "	};\n"
 "} btAabbCL;\n"
-"\n"
-"\n"
 "/// conservative test for overlap between two aabbs\n"
 "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
 "bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
@@ -53,7 +47,6 @@ static const char* sapCL= \
 "	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
 "	return overlap;\n"
 "}\n"
-"\n"
 "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
 "bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
 "{\n"
@@ -63,18 +56,14 @@ static const char* sapCL= \
 "	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
 "	return overlap;\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numUnsortedAabbs)\n"
 "		return;\n"
-"\n"
 "	int j = get_global_id(1);\n"
 "	if (j>=numSortedAabbs)\n"
 "		return;\n"
-"\n"
 "	if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
 "	{\n"
 "		int4 myPair;\n"
@@ -92,8 +81,6 @@ static const char* sapCL= \
 "		myPair.y = yIndex;\n"
 "		myPair.z = NEW_PAIR_MARKER;\n"
 "		myPair.w = NEW_PAIR_MARKER;\n"
-"\n"
-"\n"
 "		int curPair = atomic_inc (pairCount);\n"
 "		if (curPair<maxPairs)\n"
 "		{\n"
@@ -101,7 +88,6 @@ static const char* sapCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
@@ -120,7 +106,6 @@ static const char* sapCL= \
 "			myPair.y = aabbs[j].m_minIndices[3];\n"
 "			myPair.z = NEW_PAIR_MARKER;\n"
 "			myPair.w = NEW_PAIR_MARKER;\n"
-"\n"
 "			int curPair = atomic_inc (pairCount);\n"
 "			if (curPair<maxPairs)\n"
 "			{\n"
@@ -129,18 +114,12 @@ static const char* sapCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	int localId = get_local_id(0);\n"
-"\n"
 "	__local int numActiveWgItems[1];\n"
 "	__local int breakRequest[1];\n"
-"\n"
 "	if (localId==0)\n"
 "	{\n"
 "		numActiveWgItems[0] = 0;\n"
@@ -150,7 +129,6 @@ static const char* sapCL= \
 "	atomic_inc(numActiveWgItems);\n"
 "	barrier(CLK_LOCAL_MEM_FENCE);\n"
 "	int localBreak = 0;\n"
-"\n"
 "	int j=i+1;\n"
 "	do\n"
 "	{\n"
@@ -186,7 +164,6 @@ static const char* sapCL= \
 "				myPair.y = aabbs[j].m_minIndices[3];\n"
 "				myPair.z = NEW_PAIR_MARKER;\n"
 "				myPair.w = NEW_PAIR_MARKER;\n"
-"\n"
 "				int curPair = atomic_inc (pairCount);\n"
 "				if (curPair<maxPairs)\n"
 "				{\n"
@@ -195,16 +172,12 @@ static const char* sapCL= \
 "			}\n"
 "		}\n"
 "		j++;\n"
-"\n"
 "	} while (breakRequest[0]<numActiveWgItems[0]);\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	int localId = get_local_id(0);\n"
-"\n"
 "	__local int numActiveWgItems[1];\n"
 "	__local int breakRequest[1];\n"
 "	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
@@ -264,7 +237,6 @@ static const char* sapCL= \
 "				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
 "				myPair.z = NEW_PAIR_MARKER;\n"
 "				myPair.w = NEW_PAIR_MARKER;\n"
-"\n"
 "				int curPair = atomic_inc (pairCount);\n"
 "				if (curPair<maxPairs)\n"
 "				{\n"
@@ -274,7 +246,6 @@ static const char* sapCL= \
 "		}\n"
 "		\n"
 "		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"\n"
 "		localCount++;\n"
 "		if (localCount==64)\n"
 "		{\n"
@@ -288,10 +259,6 @@ static const char* sapCL= \
 "	} while (breakRequest[0]<numActiveWgItems[0]);\n"
 "	\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "//http://stereopsis.com/radix.html\n"
 "unsigned int FloatFlip(float fl);\n"
 "unsigned int FloatFlip(float fl)\n"
@@ -307,10 +274,6 @@ static const char* sapCL= \
 "	unsigned int fl = f ^ mask;\n"
 "	return *(float*)&fl;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
@@ -320,8 +283,6 @@ static const char* sapCL= \
 "	destAabbs[i] = allAabbs[src];\n"
 "	destAabbs[i].m_maxIndices[3] = src;\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
@@ -332,19 +293,13 @@ static const char* sapCL= \
 "		sortData[i].y = i;\n"
 "		\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numObjects)\n"
 "		return;\n"
-"\n"
 "		sortedAabbs[i] = aabbs[sortData[i].y];\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__kernel void   prepareSumVarianceKernel( __global const btAabbCL* aabbs, __global float4* sum, __global float4* sum2,int numAabbs)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.cpp
@@ -1,3 +1,4 @@
+
 #if 0
 /*
 Bullet Continuous Collision Detection and Physics Library
@@ -18,77 +19,22 @@ subject to the following restrictions:
 #include "b3ContactCache.h"
 #include "Bullet3Common/b3Transform.h"

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 b3Scalar					gContactBreakingThreshold = b3Scalar(0.02);
-b3Scalar					m_contactBreakingThreshold;
-b3Scalar					m_contactProcessingThreshold;

 ///gContactCalcArea3Points will approximate the convex hull area using 3 points
 ///when setting it to false, it will use 4 points to compute the area: it is more accurate but slower
 bool						gContactCalcArea3Points = true;


-b3ContactCache::b3ContactCache()
-:m_index1a(0)
-{
-}


-
-
-#ifdef DEBUG_PERSISTENCY
-#include <stdio.h>
-void	b3ContactCache::DebugPersistency()
-{
-	int i;
-	printf("DebugPersistency : numPoints %d\n",m_cachedPoints);
-	for (i=0;i<m_cachedPoints;i++)
-	{
-		printf("m_pointCache[%d].m_userPersistentData = %x\n",i,m_pointCache[i].m_userPersistentData);
-	}
-}
-#endif //DEBUG_PERSISTENCY
-
-void b3ContactCache::clearUserCache(btManifoldPoint& pt)
-{
-
-	void* oldPtr = pt.m_userPersistentData;
-	if (oldPtr)
-	{
-#ifdef DEBUG_PERSISTENCY
-		int i;
-		int occurance = 0;
-		for (i=0;i<m_cachedPoints;i++)
-		{
-			if (m_pointCache[i].m_userPersistentData == oldPtr)
-			{
-				occurance++;
-				if (occurance>1)
-					printf("error in clearUserCache\n");
-			}
-		}
-		btAssert(occurance<=0);
-#endif //DEBUG_PERSISTENCY
-
-		if (pt.m_userPersistentData && gContactDestroyedCallback)
-		{
-			(*gContactDestroyedCallback)(pt.m_userPersistentData);
-			pt.m_userPersistentData = 0;
-		}
-		
-#ifdef DEBUG_PERSISTENCY
-		DebugPersistency();
-#endif
-	}
-
-	
-}
-
-static inline b3Scalar calcArea4Points(const btVector3 &p0,const btVector3 &p1,const btVector3 &p2,const btVector3 &p3)
+static inline b3Scalar calcArea4Points(const b3Vector3 &p0,const b3Vector3 &p1,const b3Vector3 &p2,const b3Vector3 &p3)
 {
 	// It calculates possible 3 area constructed from random 4 points and returns the biggest one.

-	btVector3 a[3],b[3];
+	b3Vector3 a[3],b[3];
 	a[0] = p0 - p1;
 	a[1] = p0 - p2;
 	a[2] = p0 - p3;
@@ -97,14 +43,16 @@ static inline b3Scalar calcArea4Points(const btVector3 &p0,const btVector3 &p1,c
 	b[2] = p1 - p2;

 	//todo: Following 3 cross production can be easily optimized by SIMD.
-	btVector3 tmp0 = a[0].cross(b[0]);
-	btVector3 tmp1 = a[1].cross(b[1]);
-	btVector3 tmp2 = a[2].cross(b[2]);
+	b3Vector3 tmp0 = a[0].cross(b[0]);
+	b3Vector3 tmp1 = a[1].cross(b[1]);
+	b3Vector3 tmp2 = a[2].cross(b[2]);

-	return btMax(btMax(tmp0.length2(),tmp1.length2()),tmp2.length2());
+	return b3Max(b3Max(tmp0.length2(),tmp1.length2()),tmp2.length2());
 }
+#if 0

-int b3ContactCache::sortCachedPoints(const btManifoldPoint& pt) 
+//using localPointA for all points
+int b3ContactCache::sortCachedPoints(const b3Vector3& pt) 
 {
 		//calculate 4 possible cases areas, and take biggest area
 		//also need to keep 'deepest'
@@ -129,32 +77,32 @@ int b3ContactCache::sortCachedPoints(const btManifoldPoint& pt)
 	{
 		if (maxPenetrationIndex != 0)
 		{
-			btVector3 a0 = pt.m_localPointA-m_pointCache[1].m_localPointA;
-			btVector3 b0 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
-			btVector3 cross = a0.cross(b0);
+			b3Vector3 a0 = pt.m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 b0 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
+			b3Vector3 cross = a0.cross(b0);
 			res0 = cross.length2();
 		}
 		if (maxPenetrationIndex != 1)
 		{
-			btVector3 a1 = pt.m_localPointA-m_pointCache[0].m_localPointA;
-			btVector3 b1 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
-			btVector3 cross = a1.cross(b1);
+			b3Vector3 a1 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b1 = m_pointCache[3].m_localPointA-m_pointCache[2].m_localPointA;
+			b3Vector3 cross = a1.cross(b1);
 			res1 = cross.length2();
 		}

 		if (maxPenetrationIndex != 2)
 		{
-			btVector3 a2 = pt.m_localPointA-m_pointCache[0].m_localPointA;
-			btVector3 b2 = m_pointCache[3].m_localPointA-m_pointCache[1].m_localPointA;
-			btVector3 cross = a2.cross(b2);
+			b3Vector3 a2 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b2 = m_pointCache[3].m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 cross = a2.cross(b2);
 			res2 = cross.length2();
 		}

 		if (maxPenetrationIndex != 3)
 		{
-			btVector3 a3 = pt.m_localPointA-m_pointCache[0].m_localPointA;
-			btVector3 b3 = m_pointCache[2].m_localPointA-m_pointCache[1].m_localPointA;
-			btVector3 cross = a3.cross(b3);
+			b3Vector3 a3 = pt.m_localPointA-m_pointCache[0].m_localPointA;
+			b3Vector3 b3 = m_pointCache[2].m_localPointA-m_pointCache[1].m_localPointA;
+			b3Vector3 cross = a3.cross(b3);
 			res3 = cross.length2();
 		}
 	} 
@@ -176,23 +124,23 @@ int b3ContactCache::sortCachedPoints(const btManifoldPoint& pt)
 			res3 = calcArea4Points(pt.m_localPointA,m_pointCache[0].m_localPointA,m_pointCache[1].m_localPointA,m_pointCache[2].m_localPointA);
 		}
 	}
-	btVector4 maxvec(res0,res1,res2,res3);
+	b3Vector4 maxvec(res0,res1,res2,res3);
 	int biggestarea = maxvec.closestAxis4();
 	return biggestarea;
 	
 }


-int b3ContactCache::getCacheEntry(const btManifoldPoint& newPoint) const
+int b3ContactCache::getCacheEntry(const b3Vector3& newPoint) const
 {
 	b3Scalar shortestDist =  getContactBreakingThreshold() * getContactBreakingThreshold();
 	int size = getNumContacts();
 	int nearestPoint = -1;
 	for( int i = 0; i < size; i++ )
 	{
-		const btManifoldPoint &mp = m_pointCache[i];
+		const b3Vector3 &mp = m_pointCache[i];

-		btVector3 diffA =  mp.m_localPointA- newPoint.m_localPointA;
+		b3Vector3 diffA =  mp.m_localPointA- newPoint.m_localPointA;
 		const b3Scalar distToManiPoint = diffA.dot(diffA);
 		if( distToManiPoint < shortestDist )
 		{
@@ -203,12 +151,9 @@ int b3ContactCache::getCacheEntry(const btManifoldPoint& newPoint) const
 	return nearestPoint;
 }

-int b3ContactCache::addManifoldPoint(const btManifoldPoint& newPoint, bool isPredictive)
+int b3ContactCache::addManifoldPoint(const b3Vector3& newPoint)
 {
-	if (!isPredictive)
-	{
-		btAssert(validContactDistance(newPoint));
-	}
+	b3Assert(validContactDistance(newPoint));
 	
 	int insertIndex = getNumContacts();
 	if (insertIndex == MANIFOLD_CACHE_SIZE)
@@ -230,74 +175,81 @@ int b3ContactCache::addManifoldPoint(const btManifoldPoint& newPoint, bool isPre
 	if (insertIndex<0)
 		insertIndex=0;

-	btAssert(m_pointCache[insertIndex].m_userPersistentData==0);
+	//b3Assert(m_pointCache[insertIndex].m_userPersistentData==0);
 	m_pointCache[insertIndex] = newPoint;
 	return insertIndex;
 }

-b3Scalar	b3ContactCache::getContactBreakingThreshold() const
+#endif
+///returns true while pt.w does not exceed gContactBreakingThreshold (refreshContactPoints stores the signed contact distance in w)
+bool b3ContactCache::validContactDistance(const b3Vector3& pt)
 {
-	return m_contactBreakingThreshold;
+	return pt.w <= gContactBreakingThreshold;
 }

-
-
-void b3ContactCache::refreshContactPoints(const btTransform& trA,const btTransform& trB)
+void b3ContactCache::removeContactPoint(struct b3Contact4Data& newContactCache,int i)
 {
+	int numContacts = b3Contact4Data_getNumPoints(&newContactCache);
+	if (i!=(numContacts-1))
+	{
+		b3Swap(newContactCache.m_localPosA[i],newContactCache.m_localPosA[numContacts-1]);
+		b3Swap(newContactCache.m_localPosB[i],newContactCache.m_localPosB[numContacts-1]);
+		b3Swap(newContactCache.m_worldPos[i],newContactCache.m_worldPos[numContacts-1]);
+	}
+	b3Contact4Data_setNumPoints(&newContactCache,numContacts-1);
+	//swap-with-last removal: the dropped point now sits in the slot just past the new count, so the order of the cached points is not preserved
+}
+
+void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& contacts)
+{
+	//Re-derives every cached point from its local-space anchors (m_localPosA/m_localPosB) under trA/trB and drops the points that moved too far.
+	int numContacts = b3Contact4Data_getNumPoints(&contacts);
+	//Pass 1 writes the world position on B into m_worldPos[i] and the signed distance along m_worldNormal into its w component.
+	//Pass 2 removes points whose distance, or whose drift orthogonal to the normal, exceeds gContactBreakingThreshold; both passes run back to front.
 	int i;
-#ifdef DEBUG_PERSISTENCY
-	printf("refreshContactPoints posA = (%f,%f,%f) posB = (%f,%f,%f)\n",
-		trA.getOrigin().getX(),
-		trA.getOrigin().getY(),
-		trA.getOrigin().getZ(),
-		trB.getOrigin().getX(),
-		trB.getOrigin().getY(),
-		trB.getOrigin().getZ());
-#endif //DEBUG_PERSISTENCY
 	/// first refresh worldspace positions and distance
-	for (i=getNumContacts()-1;i>=0;i--)
+	for (i=numContacts-1;i>=0;i--)
 	{
-		btManifoldPoint &manifoldPoint = m_pointCache[i];
-		manifoldPoint.m_positionWorldOnA = trA( manifoldPoint.m_localPointA );
-		manifoldPoint.m_positionWorldOnB = trB( manifoldPoint.m_localPointB );
-		manifoldPoint.m_distance1 = (manifoldPoint.m_positionWorldOnA -  manifoldPoint.m_positionWorldOnB).dot(manifoldPoint.m_normalWorldOnB);
-		manifoldPoint.m_lifeTime++;
+		b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
+		b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
+		contacts.m_worldPos[i] = worldPosB;
+		b3Scalar distance = (worldPosA -  worldPosB).dot(contacts.m_worldNormal);
+		contacts.m_worldPos[i].w = distance;
 	}

 	/// then 
 	b3Scalar distance2d;
-	btVector3 projectedDifference,projectedPoint;
-	for (i=getNumContacts()-1;i>=0;i--)
+	b3Vector3 projectedDifference,projectedPoint;
+	for (i=numContacts-1;i>=0;i--)
 	{
-		
-		btManifoldPoint &manifoldPoint = m_pointCache[i];
+		b3Vector3 worldPosA = trA( contacts.m_localPosA[i]);
+		b3Vector3 worldPosB = trB( contacts.m_localPosB[i]);
+		b3Vector3&pt = contacts.m_worldPos[i];
 		//contact becomes invalid when signed distance exceeds margin (projected on contactnormal direction)
-		if (!validContactDistance(manifoldPoint))
+		if (!validContactDistance(pt))
 		{
-			removeContactPoint(i);
+			removeContactPoint(contacts,i);
 		} else
 		{
 			//contact also becomes invalid when relative movement orthogonal to normal exceeds margin
-			projectedPoint = manifoldPoint.m_positionWorldOnA - manifoldPoint.m_normalWorldOnB * manifoldPoint.m_distance1;
-			projectedDifference = manifoldPoint.m_positionWorldOnB - projectedPoint;
+			projectedPoint = worldPosA - contacts.m_worldNormal * contacts.m_worldPos[i].w;//slide A's point along the normal by the stored distance; starting from m_worldPos[i] would leave only normal*distance below
+			projectedDifference = worldPosB - projectedPoint;//what remains is the drift between the two anchors orthogonal to the normal
 			distance2d = projectedDifference.dot(projectedDifference);
-			if (distance2d  > getContactBreakingThreshold()*getContactBreakingThreshold() )
+			if (distance2d  > gContactBreakingThreshold*gContactBreakingThreshold )
 			{
-				removeContactPoint(i);
+				removeContactPoint(contacts,i);
 			} else
 			{
-				//contact point processed callback
-				if (gContactProcessedCallback)
-					(*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
+				////contact point processed callback
+				//if (gContactProcessedCallback)
+				//	(*gContactProcessedCallback)(manifoldPoint,(void*)m_body0,(void*)m_body1);
 			}
 		}
 	}
-#ifdef DEBUG_PERSISTENCY
-	DebugPersistency();
-#endif //
-}
 	

+}
+



--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h
@@ -1,7 +1,7 @@

 /*
 Bullet Continuous Collision Detection and Physics Library
-Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+Copyright (c) 2003-2013 Erwin Coumans  http://bulletphysics.org

 This software is provided 'as-is', without any express or implied warranty.
 In no event will the authors be held liable for any damages arising from the use of this software.
@@ -18,23 +18,15 @@ subject to the following restrictions:
 #define B3_CONTACT_CACHE_H


-#include "LinearMath/btVector3.h"
-#include "LinearMath/btTransform.h"
-#include "btManifoldPoint.h"
-class btCollisionObject;
-#include "LinearMath/btAlignedAllocator.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3Common/b3Transform.h"
+#include "Bullet3Common/b3AlignedAllocator.h"

-struct btCollisionResult;

 ///maximum contact breaking and merging threshold
 extern b3Scalar gContactBreakingThreshold;

-//the enum starts at 1024 to avoid type conflicts with btTypedConstraint
-enum btContactManifoldTypes
-{
-	MIN_CONTACT_MANIFOLD_TYPE = 1024,
-	BT_PERSISTENT_MANIFOLD_TYPE
-};
+

 #define MANIFOLD_CACHE_SIZE 4

@@ -45,8 +37,6 @@ enum btContactManifoldTypes
 ///reduces the cache to 4 points, when more then 4 points are added, using following rules:
 ///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points
 ///note that some pairs of objects might have more then one contact manifold.
-
-
 B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache
 {

@@ -54,163 +44,36 @@ B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache

 	
 	/// sort cached points so most isolated points come first
-	int	sortCachedPoints(const btManifoldPoint& pt);
+	int	sortCachedPoints(const b3Vector3& pt);
+
 	
-	int		findContactPoint(const btManifoldPoint* unUsed, int numUnused,const btManifoldPoint& pt);

 public:

-	BT_DECLARE_ALIGNED_ALLOCATOR();
+	B3_DECLARE_ALIGNED_ALLOCATOR();

 	
-	int m_index1a;
 	
-	b3ContactCache();
+	int addManifoldPoint( const b3Vector3& newPoint);

-	b3ContactCache(const btCollisionObject* body0,const btCollisionObject* body1,int , b3Scalar contactBreakingThreshold,b3Scalar contactProcessingThreshold)
-		: btTypedObject(BT_PERSISTENT_MANIFOLD_TYPE),
-	m_body0(body0),m_body1(body1),m_cachedPoints(0),
-		m_contactBreakingThreshold(contactBreakingThreshold),
-		m_contactProcessingThreshold(contactProcessingThreshold)
+	/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
 	{
-	}
-
-	B3_FORCE_INLINE const btCollisionObject* getBody0() const { return m_body0;}
-	B3_FORCE_INLINE const btCollisionObject* getBody1() const { return m_body1;}
-
-	void	setBodies(const btCollisionObject* body0,const btCollisionObject* body1)
-	{
-		m_body0 = body0;
-		m_body1 = body1;
-	}
-
-	void clearUserCache(btManifoldPoint& pt);
-
-#ifdef DEBUG_PERSISTENCY
-	void	DebugPersistency();
-#endif //
-	
-	B3_FORCE_INLINE int	getNumContacts() const { return m_cachedPoints;}
-	/// the setNumContacts API is usually not used, except when you gather/fill all contacts manually
-	void setNumContacts(int cachedPoints)
-	{
-		m_cachedPoints = cachedPoints;
-	}
-
-
-	B3_FORCE_INLINE const btManifoldPoint& getContactPoint(int index) const
-	{
-		btAssert(index < m_cachedPoints);
-		return m_pointCache[index];
-	}
-
-	B3_FORCE_INLINE btManifoldPoint& getContactPoint(int index)
-	{
-		btAssert(index < m_cachedPoints);
-		return m_pointCache[index];
-	}
-
-	
-	void setContactBreakingThreshold(b3Scalar contactBreakingThreshold)
-	{
-		m_contactBreakingThreshold = contactBreakingThreshold;
-	}
-
-	void setContactProcessingThreshold(b3Scalar	contactProcessingThreshold)
-	{
-		m_contactProcessingThreshold = contactProcessingThreshold;
-	}
-	
-	
-
-
-	int getCacheEntry(const btManifoldPoint& newPoint) const;
-
-	int addManifoldPoint( const btManifoldPoint& newPoint, bool isPredictive=false);
-
-	void removeContactPoint (int index)
-	{
-		clearUserCache(m_pointCache[index]);
-
-		int lastUsedIndex = getNumContacts() - 1;
-//		m_pointCache[index] = m_pointCache[lastUsedIndex];
-		if(index != lastUsedIndex) 
-		{
-			m_pointCache[index] = m_pointCache[lastUsedIndex]; 
-			//get rid of duplicated userPersistentData pointer
-			m_pointCache[lastUsedIndex].m_userPersistentData = 0;
-			m_pointCache[lastUsedIndex].m_appliedImpulse = 0.f;
-			m_pointCache[lastUsedIndex].m_lateralFrictionInitialized = false;
-			m_pointCache[lastUsedIndex].m_appliedImpulseLateral1 = 0.f;
-			m_pointCache[lastUsedIndex].m_appliedImpulseLateral2 = 0.f;
-			m_pointCache[lastUsedIndex].m_lifeTime = 0;
-		}
-
-		btAssert(m_pointCache[lastUsedIndex].m_userPersistentData==0);
-		m_cachedPoints--;
-	}
-	void replaceContactPoint(const btManifoldPoint& newPoint,int insertIndex)
-	{
-		btAssert(validContactDistance(newPoint));
-
-#define MAINTAIN_PERSISTENCY 1
-#ifdef MAINTAIN_PERSISTENCY
-		int	lifeTime = m_pointCache[insertIndex].getLifeTime();
-		b3Scalar	appliedImpulse = m_pointCache[insertIndex].m_appliedImpulse;
-		b3Scalar	appliedLateralImpulse1 = m_pointCache[insertIndex].m_appliedImpulseLateral1;
-		b3Scalar	appliedLateralImpulse2 = m_pointCache[insertIndex].m_appliedImpulseLateral2;
-//		bool isLateralFrictionInitialized = m_pointCache[insertIndex].m_lateralFrictionInitialized;
-		
-		
-			
-		btAssert(lifeTime>=0);
-		void* cache = m_pointCache[insertIndex].m_userPersistentData;
-		
+		b3Assert(validContactDistance(newPoint));
 		m_pointCache[insertIndex] = newPoint;
-
-		m_pointCache[insertIndex].m_userPersistentData = cache;
-		m_pointCache[insertIndex].m_appliedImpulse = appliedImpulse;
-		m_pointCache[insertIndex].m_appliedImpulseLateral1 = appliedLateralImpulse1;
-		m_pointCache[insertIndex].m_appliedImpulseLateral2 = appliedLateralImpulse2;
-		
-		m_pointCache[insertIndex].m_appliedImpulse =  appliedImpulse;
-		m_pointCache[insertIndex].m_appliedImpulseLateral1 = appliedLateralImpulse1;
-		m_pointCache[insertIndex].m_appliedImpulseLateral2 = appliedLateralImpulse2;
-
-
-		m_pointCache[insertIndex].m_lifeTime = lifeTime;
-#else
-		clearUserCache(m_pointCache[insertIndex]);
-		m_pointCache[insertIndex] = newPoint;
-	
-#endif
 	}
+	*/


-	bool validContactDistance(const btManifoldPoint& pt) const
-	{
-		return pt.m_distance1 <= getContactBreakingThreshold();
-	}
+	
+	static bool validContactDistance(const b3Vector3& pt);
+	
 	/// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin
-	void	refreshContactPoints(  const btTransform& trA,const btTransform& trB);
-
-	
-	B3_FORCE_INLINE	void	clearManifold()
-	{
-		int i;
-		for (i=0;i<m_cachedPoints;i++)
-		{
-			clearUserCache(m_pointCache[i]);
-		}
-		m_cachedPoints = 0;
-	}
-
-
-
-}
-;
+	static void	refreshContactPoints(  const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache);
+
+	static void removeContactPoint(struct b3Contact4Data& newContactCache,int i);
 	

+};



--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
@@ -19,13 +19,14 @@ subject to the following restrictions:
 ///And contact clipping based on work from Simon Hobbs

 //#define B3_DEBUG_SAT_FACE
+//#define CHECK_ON_HOST

 int b3g_actualSATPairTests=0;

 #include "b3ConvexHullContact.h"
 #include <string.h>//memcpy
 #include "b3ConvexPolyhedronCL.h"
-
+#include "Bullet3OpenCL/NarrowphaseCollision/b3ContactCache.h"

 typedef b3AlignedObjectArray<b3Vector3> b3VertexArray;

@@ -1603,7 +1604,7 @@ int computeContactConvexConvex( b3AlignedObjectArray<b3Int4>& pairs,
 	transB.setRotation(rigidBodies[bodyIndexB].m_quat);
 	float maximumDistanceSquared = 1e30f;
 					
-	b3Vector3 resultPointOnB;
+	b3Vector3 resultPointOnBWorld;
 	b3Vector3 sepAxis2(0,1,0);
 	b3Scalar distance2 = 1e30f;
 	
@@ -1618,7 +1619,7 @@ int computeContactConvexConvex( b3AlignedObjectArray<b3Int4>& pairs,
 		maximumDistanceSquared,
 		sepAxis2,
 		distance2,
-		resultPointOnB);
+		resultPointOnBWorld);
 	
 	
 	if (result2)
@@ -1627,31 +1628,58 @@ int computeContactConvexConvex( b3AlignedObjectArray<b3Int4>& pairs,
 		{
 			contactIndex = nGlobalContactsOut;
 			globalContactsOut.expand();
-			b3Contact4& contact = globalContactsOut.at(nGlobalContactsOut);
-			contact.m_batchIdx = 0;//i;
-			contact.m_bodyAPtrAndSignBit = (rigidBodies.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA;
-			contact.m_bodyBPtrAndSignBit = (rigidBodies.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;
+			b3Contact4& newContact = globalContactsOut.at(nGlobalContactsOut);
+			newContact.m_batchIdx = 0;//i;
+			newContact.m_bodyAPtrAndSignBit = (rigidBodies.at(bodyIndexA).m_invMass==0)? -bodyIndexA:bodyIndexA;
+			newContact.m_bodyBPtrAndSignBit = (rigidBodies.at(bodyIndexB).m_invMass==0)? -bodyIndexB:bodyIndexB;

-			contact.m_frictionCoeffCmp = 45874;
-			contact.m_restituitionCoeffCmp = 0;
+			newContact.m_frictionCoeffCmp = 45874;
+			newContact.m_restituitionCoeffCmp = 0;
 					
 			
-			int numPoints = 1;
-			if (pairs[pairIndex].z>=0)
+			int numPoints = 0;
+			if (0)//pairs[pairIndex].z>=0)
 			{
-				printf("add existing points?\n");
+				//printf("add existing points?\n");
+				//refresh
+				
+				int numOldPoints = oldContacts[pairs[pairIndex].z].getNPoints();
+				if (numOldPoints)
+				{
+					newContact = oldContacts[pairs[pairIndex].z];
+					//b3ContactCache::refreshContactPoints(transA,transB,newContact);
+				}
+				numPoints = b3Contact4Data_getNumPoints(&newContact);

 			}
-			for (int p=0;p<numPoints;p++)
+
+			/*
+			int insertIndex = m_manifoldPtr->getCacheEntry(newPt);
+				if (insertIndex >= 0)
 				{
-				resultPointOnB.w = distance2;
+					//const btManifoldPoint& oldPoint = m_manifoldPtr->getContactPoint(insertIndex);
+					m_manifoldPtr->replaceContactPoint(newPt,insertIndex);
+				} else
+				{
+					insertIndex = m_manifoldPtr->addManifoldPoint(newPt);
+				}
+			*/
 			
-				contact.m_worldPos[p] = resultPointOnB;
-				
-				contact.m_worldNormal = -sepAxis2; 
+			int p=numPoints;
+			if (numPoints<3)
+			{
+				numPoints++;
+			}
+			{
+				resultPointOnBWorld.w = distance2;
+				newContact.m_worldPos[p] = resultPointOnBWorld;
+				b3Vector3 resultPointOnAWorld = resultPointOnBWorld+distance2*sepAxis2;
+				//newContact.m_localPosA[p] = transA.inverse()*resultPointOnAWorld;
+			//	newContact.m_localPosB[p] = transB.inverse()*resultPointOnBWorld;
+				newContact.m_worldNormal = sepAxis2; 
 			}
 			//printf("bodyIndexA %d,bodyIndexB %d,normal=%f,%f,%f numPoints %d\n",bodyIndexA,bodyIndexB,normalOnSurfaceB.x,normalOnSurfaceB.y,normalOnSurfaceB.z,numPoints);
-			contact.m_worldNormal.w = (b3Scalar)numPoints;
+			newContact.m_worldNormal.w = (b3Scalar)numPoints;
 			nGlobalContactsOut++;
 		} else
 		{
@@ -1797,7 +1825,7 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>*
 		return;


-//#define CHECK_ON_HOST
+
 #ifdef CHECK_ON_HOST
 	b3AlignedObjectArray<b3YetAnotherAabb> hostAabbs;
 	clAabbsWS.copyToHost(hostAabbs);
@@ -1909,9 +1937,12 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>*
 			hostCollidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)
 		{
 			//printf("hostPairs[i].z=%d\n",hostPairs[i].z);
-			int contactIndex = computeContactConvexConvex(hostPairs,i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,
-					hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,
-					oldHostContacts);
+			int contactIndex = computeContactConvexConvex2(i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,
+					hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,oldHostContacts);
+			//int contactIndex = computeContactConvexConvex(hostPairs,i,bodyIndexA,bodyIndexB,collidableIndexA,collidableIndexB,hostBodyBuf,
+			//		hostCollidables,hostConvexData,hostVertices,hostUniqueEdges,hostIndices,hostFaces,hostContacts,nContacts,maxContactCapacity,
+			//		oldHostContacts);
+

 			if (contactIndex>=0)
 			{
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/bvhTraversal.h
@@ -2,17 +2,13 @@
 static const char* bvhTraversalKernelCL= \
 "//keep this enum in sync with the CPU version (in btCollidable.h)\n"
 "//written by Erwin Coumans\n"
-"\n"
 "#define SHAPE_CONVEX_HULL 3\n"
 "#define SHAPE_CONCAVE_TRIMESH 5\n"
 "#define TRIANGLE_NUM_CONVEX_FACES 5\n"
 "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
 "#define SHAPE_SPHERE 7\n"
-"\n"
 "typedef unsigned int u32;\n"
-"\n"
 "#define MAX_NUM_PARTS_IN_BITS 10\n"
-"\n"
 "///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
 "///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
 "typedef struct\n"
@@ -23,7 +19,6 @@ static const char* bvhTraversalKernelCL= \
 "	//4 bytes\n"
 "	int	m_escapeIndexOrTriangleIndex;\n"
 "} btQuantizedBvhNode;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4		m_aabbMin;\n"
@@ -33,9 +28,7 @@ static const char* bvhTraversalKernelCL= \
 "	int			m_numSubTrees;\n"
 "	int			m_nodeOffset;\n"
 "	int			m_subTreeOffset;\n"
-"\n"
 "} b3BvhInfo;\n"
-"\n"
 "/*\n"
 "	bool isLeafNode() const\n"
 "	{\n"
@@ -62,7 +55,6 @@ static const char* bvhTraversalKernelCL= \
 "		return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));\n"
 "	}\n"
 "*/\n"
-"\n"
 "int	getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
 "{\n"
 "	unsigned int x=0;\n"
@@ -70,7 +62,6 @@ static const char* bvhTraversalKernelCL= \
 "	// Get only the lower bits where the triangle index is stored\n"
 "	return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
 "}\n"
-"\n"
 "int isLeaf(const btQuantizedBvhNode* rootNode)\n"
 "{\n"
 "	//skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
@@ -81,7 +72,6 @@ static const char* bvhTraversalKernelCL= \
 "{\n"
 "	return -rootNode->m_escapeIndexOrTriangleIndex;\n"
 "}\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	//12 bytes\n"
@@ -93,7 +83,6 @@ static const char* bvhTraversalKernelCL= \
 "	int			m_subtreeSize;\n"
 "	int			m_padding[3];\n"
 "} btBvhSubtreeInfo;\n"
-"\n"
 "///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
@@ -103,7 +92,6 @@ static const char* bvhTraversalKernelCL= \
 "	int m_shapeIndex;\n"
 "	\n"
 "} btCollidableGpu;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4	m_childPosition;\n"
@@ -113,21 +101,17 @@ static const char* bvhTraversalKernelCL= \
 "	int m_unused1;\n"
 "	int m_unused2;\n"
 "} btGpuChildShape;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} BodyData;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	union\n"
@@ -143,8 +127,6 @@ static const char* bvhTraversalKernelCL= \
 "		int			m_maxIndices[4];\n"
 "	};\n"
 "} btAabbCL;\n"
-"\n"
-"\n"
 "int testQuantizedAabbAgainstQuantizedAabb(\n"
 "								const unsigned short int* aabbMin1,\n"
 "								const unsigned short int* aabbMax1,\n"
@@ -170,13 +152,10 @@ static const char* bvhTraversalKernelCL= \
 "	//overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
 "	//return overlap;\n"
 "}\n"
-"\n"
-"\n"
 "void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
 "{\n"
 "	float4 clampedPoint = max(point2,bvhAabbMin);\n"
 "	clampedPoint = min (clampedPoint, bvhAabbMax);\n"
-"\n"
 "	float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
 "	if (isMax)\n"
 "	{\n"
@@ -189,10 +168,7 @@ static const char* bvhTraversalKernelCL= \
 "		out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
 "		out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
 "	}\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   bvhTraversalKernel( __global const int4* pairs, \n"
 "									__global const BodyData* rigidBodies, \n"
@@ -223,7 +199,6 @@ static const char* bvhTraversalKernelCL= \
 "		\n"
 "	if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
 "		return;\n"
-"\n"
 "	int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
 "		\n"
 "	if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
@@ -231,9 +206,7 @@ static const char* bvhTraversalKernelCL= \
 "		shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
 "		)\n"
 "		return;\n"
-"\n"
 "	b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
-"\n"
 "	float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
 "	float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
 "	float4 bvhQuantization = bvhInfo.m_quantization;\n"
@@ -241,7 +214,6 @@ static const char* bvhTraversalKernelCL= \
 "	__global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
 "	__global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
 "	\n"
-"\n"
 "	unsigned short int quantizedQueryAabbMin[3];\n"
 "	unsigned short int quantizedQueryAabbMax[3];\n"
 "	quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
@@ -308,6 +280,5 @@ static const char* bvhTraversalKernelCL= \
 "			}\n"
 "		}\n"
 "	}\n"
-"\n"
 "}\n"
 ;
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.cl
@@ -345,7 +345,7 @@ void	computeContactSphereConvex(int pairIndex,
 																__global const float4* convexVertices,
 																__global const int* convexIndices,
 																__global const btGpuFace* faces,
-																__global b3Contact4Data* restrict globalContactsOut,
+																__global struct b3Contact4Data* restrict globalContactsOut,
 																counter32_t nGlobalContactsOut,
 																int maxContactCapacity,
 																float4 spherePos2,
@@ -466,9 +466,9 @@ void	computeContactSphereConvex(int pairIndex,
 			
 			if (1)//dstIdx < maxContactCapacity)
 			{
-				__global b3Contact4Data* c = &globalContactsOut[dstIdx];
+				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 				c->m_worldNormal = normalOnSurfaceB1;
-				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 				c->m_batchIdx = pairIndex;
 				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
 				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
@@ -590,7 +590,7 @@ int computeContactPlaneConvex(int pairIndex,
 								__global const float4* convexVertices,
 								__global const int* convexIndices,
 								__global const btGpuFace* faces,
-								__global b3Contact4Data* restrict globalContactsOut,
+								__global struct b3Contact4Data* restrict globalContactsOut,
 								counter32_t nGlobalContactsOut,
 								int maxContactCapacity,
 								float4 posB,
@@ -692,11 +692,11 @@ int computeContactPlaneConvex(int pairIndex,
 		if (dstIdx < maxContactCapacity)
 		{
 			resultIndex = dstIdx;
-			__global b3Contact4Data* c = &globalContactsOut[dstIdx];
+			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 			c->m_worldNormal = planeNormalWorld;
 			//c->setFrictionCoeff(0.7);
 			//c->setRestituitionCoeff(0.f);
-			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 			c->m_batchIdx = pairIndex;
 			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
 			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
@@ -732,7 +732,7 @@ void	computeContactPlaneSphere(int pairIndex,
 																__global const BodyData* rigidBodies, 
 																__global const btCollidableGpu* collidables,
 																__global const btGpuFace* faces,
-																__global b3Contact4Data* restrict globalContactsOut,
+																__global struct b3Contact4Data* restrict globalContactsOut,
 																counter32_t nGlobalContactsOut,
 																int maxContactCapacity)
 {
@@ -775,9 +775,9 @@ void	computeContactPlaneSphere(int pairIndex,
 		
 		if (dstIdx < maxContactCapacity)
 		{
-			__global b3Contact4Data* c = &globalContactsOut[dstIdx];
+			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 			c->m_worldNormal = normalOnSurfaceB1;
-			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 			c->m_batchIdx = pairIndex;
 			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
 			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
@@ -798,7 +798,7 @@ __kernel void   primitiveContactsKernel( __global int4* pairs,
 																					__global const float4* uniqueEdges,
 																					__global const btGpuFace* faces,
 																					__global const int* indices,
-																					__global b3Contact4Data* restrict globalContactsOut,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
 																					counter32_t nGlobalContactsOut,
 																					int numPairs, int maxContactCapacity)
 {
@@ -953,9 +953,9 @@ __kernel void   primitiveContactsKernel( __global int4* pairs,
 				
 				if (dstIdx < maxContactCapacity)
 				{
-					__global b3Contact4Data* c = &globalContactsOut[dstIdx];
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 					c->m_worldNormal = -normalOnSurfaceB;
-					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 					c->m_batchIdx = pairIndex;
 					int bodyA = pairs[pairIndex].x;
 					int bodyB = pairs[pairIndex].y;
@@ -987,7 +987,7 @@ __kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCo
 													__global const int* indices,
 													__global btAabbCL* aabbs,
 													__global const btGpuChildShape* gpuChildShapes,
-													__global b3Contact4Data* restrict globalContactsOut,
+													__global struct b3Contact4Data* restrict globalContactsOut,
 													counter32_t nGlobalContactsOut,
 													int numCompoundPairs, int maxContactCapacity
 													)
@@ -1166,7 +1166,7 @@ void	computeContactSphereTriangle(int pairIndex,
 									__global const BodyData* rigidBodies, 
 									__global const btCollidableGpu* collidables,
 									const float4* triangleVertices,
-									__global b3Contact4Data* restrict globalContactsOut,
+									__global struct b3Contact4Data* restrict globalContactsOut,
 									counter32_t nGlobalContactsOut,
 									int maxContactCapacity,
 									float4 spherePos2,
@@ -1293,9 +1293,9 @@ void	computeContactSphereTriangle(int pairIndex,
 			
 				if (dstIdx < maxContactCapacity)
 				{
-					__global b3Contact4Data* c = &globalContactsOut[dstIdx];
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 					c->m_worldNormal = normalOnSurfaceB1;
-					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 					c->m_batchIdx = pairIndex;
 					c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
 					c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
@@ -1325,7 +1325,7 @@ __kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs,
 												__global const btGpuFace* faces,
 												__global const int* indices,
 												__global btAabbCL* aabbs,
-												__global b3Contact4Data* restrict globalContactsOut,
+												__global struct b3Contact4Data* restrict globalContactsOut,
 												counter32_t nGlobalContactsOut,
 													int numConcavePairs, int maxContactCapacity
 												)
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
@@ -1,67 +1,62 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* primitiveContactsKernelsCL= \
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
 "struct MyTest\n"
 "{\n"
 "	int bla;\n"
 "};\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
 "	typedef float4	b3Float4;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"typedef struct\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
 "{\n"
 "	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
 "	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
-"	unsigned int  m_coeffs;\n"
-"	unsigned int m_batchIdx;\n"
-"\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
 "	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
 "	int m_bodyBPtrAndSignBit;\n"
-"\n"
 "	int	m_childIndexA;\n"
 "	int	m_childIndexB;\n"
 "	int m_unused1;\n"
 "	int m_unused2;\n"
-"\n"
-"} b3Contact4Data;\n"
-"\n"
-"\n"
-"\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#define SHAPE_CONVEX_HULL 3\n"
 "#define SHAPE_PLANE 4\n"
 "#define SHAPE_CONCAVE_TRIMESH 5\n"
 "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
 "#define SHAPE_SPHERE 7\n"
-"\n"
-"\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile __global int*\n"
 "#endif\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -75,15 +70,9 @@ static const char* primitiveContactsKernelsCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
 "typedef unsigned int u32;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	union\n"
@@ -99,7 +88,6 @@ static const char* primitiveContactsKernelsCL= \
 "		int			m_maxIndices[4];\n"
 "	};\n"
 "} btAabbCL;\n"
-"\n"
 "///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
@@ -109,7 +97,6 @@ static const char* primitiveContactsKernelsCL= \
 "	int m_shapeIndex;\n"
 "	\n"
 "} btCollidableGpu;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4	m_childPosition;\n"
@@ -119,23 +106,18 @@ static const char* primitiveContactsKernelsCL= \
 "	int m_unused1;\n"
 "	int m_unused2;\n"
 "} btGpuChildShape;\n"
-"\n"
 "#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_collidableIdx;	\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} BodyData;\n"
-"\n"
-"\n"
 "typedef struct  \n"
 "{\n"
 "	float4		m_localCenter;\n"
@@ -152,48 +134,37 @@ static const char* primitiveContactsKernelsCL= \
 "	int	m_uniqueEdgesOffset;\n"
 "	int	m_numUniqueEdges;\n"
 "	int m_unused;\n"
-"\n"
 "} ConvexPolyhedronCL;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_plane;\n"
 "	int m_indexOffset;\n"
 "	int m_numIndices;\n"
 "} btGpuFace;\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "__inline\n"
 "float fastDiv(float numerator, float denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "//#define dot3F4 dot\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -201,35 +172,23 @@ static const char* primitiveContactsKernelsCL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -240,7 +199,6 @@ static const char* primitiveContactsKernelsCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -257,32 +215,27 @@ static const char* primitiveContactsKernelsCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
 "{\n"
 "	return qtRotate( *orientation, *p ) + (*translation);\n"
 "}\n"
-"\n"
 "void	trInverse(float4 translationIn, Quaternion orientationIn,\n"
 "		float4* translationOut, Quaternion* orientationOut)\n"
 "{\n"
 "	*orientationOut = qtInvert(orientationIn);\n"
 "	*translationOut = qtRotate(*orientationOut, -translationIn);\n"
 "}\n"
-"\n"
 "void	trMul(float4 translationA, Quaternion orientationA,\n"
 "						float4 translationB, Quaternion orientationB,\n"
 "		float4* translationOut, Quaternion* orientationOut)\n"
@@ -290,17 +243,12 @@ static const char* primitiveContactsKernelsCL= \
 "	*orientationOut = qtMul(orientationA,orientationB);\n"
 "	*translationOut = transform(&translationB,&translationA,&orientationA);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
 "	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
 "	return fastNormalize4( n );\n"
 "}\n"
-"\n"
-"\n"
 "__inline float4 lerp3(const float4 a,const float4 b, float  t)\n"
 "{\n"
 "	return make_float4(	a.x + (b.x - a.x) * t,\n"
@@ -308,8 +256,6 @@ static const char* primitiveContactsKernelsCL= \
 "						a.z + (b.z - a.z) * t,\n"
 "						0.f);\n"
 "}\n"
-"\n"
-"\n"
 "float signedDistanceFromPointToPlane(float4 point, float4 planeEqn, float4* closestPointOnFace)\n"
 "{\n"
 "	float4 n = (float4)(planeEqn.x, planeEqn.y, planeEqn.z, 0);\n"
@@ -317,9 +263,6 @@ static const char* primitiveContactsKernelsCL= \
 "	*closestPointOnFace = point - dist * n;\n"
 "	return dist;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "inline bool IsPointInPolygon(float4 p, \n"
 "							const btGpuFace* face,\n"
 "							__global const float4* baseVertex,\n"
@@ -331,17 +274,14 @@ static const char* primitiveContactsKernelsCL= \
 "    float4 ab;\n"
 "    float4 ap;\n"
 "    float4 v;\n"
-"\n"
 "	float4 plane = make_float4(face->m_plane.x,face->m_plane.y,face->m_plane.z,0.f);\n"
 "	\n"
 "	if (face->m_numIndices<2)\n"
 "		return false;\n"
 "	\n"
-"	\n"
 "	float4 v0 = baseVertex[convexIndices[face->m_indexOffset + face->m_numIndices-1]];\n"
 "	\n"
 "	b = v0;\n"
-"\n"
 "    for(unsigned i=0; i != face->m_numIndices; ++i)\n"
 "    {\n"
 "		a = b;\n"
@@ -350,7 +290,6 @@ static const char* primitiveContactsKernelsCL= \
 "        ab = b-a;\n"
 "        ap = p-a;\n"
 "        v = cross3(ab,plane);\n"
-"\n"
 "        if (dot(ap, v) > 0.f)\n"
 "        {\n"
 "            float ab_m2 = dot(ab, ab);\n"
@@ -375,10 +314,6 @@ static const char* primitiveContactsKernelsCL= \
 "    }\n"
 "    return true;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "void	computeContactSphereConvex(int pairIndex,\n"
 "																int bodyIndexA, int bodyIndexB, \n"
 "																int collidableIndexA, int collidableIndexB, \n"
@@ -388,7 +323,7 @@ static const char* primitiveContactsKernelsCL= \
 "																__global const float4* convexVertices,\n"
 "																__global const int* convexIndices,\n"
 "																__global const btGpuFace* faces,\n"
-"																__global b3Contact4Data* restrict globalContactsOut,\n"
+"																__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "																counter32_t nGlobalContactsOut,\n"
 "																int maxContactCapacity,\n"
 "																float4 spherePos2,\n"
@@ -397,25 +332,19 @@ static const char* primitiveContactsKernelsCL= \
 "																float4 quat\n"
 "																)\n"
 "{\n"
-"\n"
 "	float4 invPos;\n"
 "	float4 invOrn;\n"
-"\n"
 "	trInverse(pos,quat, &invPos,&invOrn);\n"
-"\n"
 "	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n"
-"\n"
 "	int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n"
 "	int numFaces = convexShapes[shapeIndex].m_numFaces;\n"
 "	float4 closestPnt = (float4)(0, 0, 0, 0);\n"
 "	float4 hitNormalWorld = (float4)(0, 0, 0, 0);\n"
 "	float minDist = -1000000.f;\n"
 "	bool bCollide = true;\n"
-"\n"
 "	for ( int f = 0; f < numFaces; f++ )\n"
 "	{\n"
 "		btGpuFace face = faces[convexShapes[shapeIndex].m_faceOffset+f];\n"
-"\n"
 "		// set up a plane equation \n"
 "		float4 planeEqn;\n"
 "		float4 n1 = face.m_plane;\n"
@@ -427,21 +356,17 @@ static const char* primitiveContactsKernelsCL= \
 "		// compute a signed distance from the vertex in cloth to the face of rigidbody.\n"
 "		float4 pntReturn;\n"
 "		float dist = signedDistanceFromPointToPlane(spherePos, planeEqn, &pntReturn);\n"
-"\n"
 "		// If the distance is positive, the plane is a separating plane. \n"
 "		if ( dist > radius )\n"
 "		{\n"
 "			bCollide = false;\n"
 "			break;\n"
 "		}\n"
-"\n"
-"\n"
 "		if (dist>0)\n"
 "		{\n"
 "			//might hit an edge or vertex\n"
 "			float4 out;\n"
 "			float4 zeroPos = make_float4(0,0,0,0);\n"
-"\n"
 "			bool isInPoly = IsPointInPolygon(spherePos,\n"
 "					&face,\n"
 "					&convexVertices[convexShapes[shapeIndex].m_vertexOffset],\n"
@@ -489,8 +414,6 @@ static const char* primitiveContactsKernelsCL= \
 "		\n"
 "	}\n"
 "	\n"
-"	\n"
-"\n"
 "	if (bCollide && minDist > -10000)\n"
 "	{\n"
 "		float4 normalOnSurfaceB1 = qtRotate(quat,-hitNormalWorld);\n"
@@ -500,35 +423,28 @@ static const char* primitiveContactsKernelsCL= \
 "		if (actualDepth<=0.f)\n"
 "		{\n"
 "			\n"
-"\n"
 "			pOnB1.w = actualDepth;\n"
-"\n"
 "			int dstIdx;\n"
 "			AppendInc( nGlobalContactsOut, dstIdx );\n"
 "		\n"
 "			\n"
 "			if (1)//dstIdx < maxContactCapacity)\n"
 "			{\n"
-"				__global b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"				__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
 "				c->m_worldNormal = normalOnSurfaceB1;\n"
-"				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
 "				c->m_batchIdx = pairIndex;\n"
 "				c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
 "				c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
 "				c->m_worldPos[0] = pOnB1;\n"
 "				c->m_childIndexA = -1;\n"
 "				c->m_childIndexB = -1;\n"
-"\n"
 "				GET_NPOINTS(*c) = 1;\n"
 "			} \n"
-"\n"
 "		}\n"
 "	}//if (hasCollision)\n"
-"\n"
 "}\n"
 "							\n"
-"\n"
-"\n"
 "int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n"
 "{\n"
 "	if( nPoints == 0 )\n"
@@ -621,9 +537,7 @@ static const char* primitiveContactsKernelsCL= \
 "    return 4;\n"
 "    \n"
 "}\n"
-"\n"
 "#define MAX_PLANE_CONVEX_POINTS 64\n"
-"\n"
 "int computeContactPlaneConvex(int pairIndex,\n"
 "								int bodyIndexA, int bodyIndexB, \n"
 "								int collidableIndexA, int collidableIndexB, \n"
@@ -633,7 +547,7 @@ static const char* primitiveContactsKernelsCL= \
 "								__global const float4* convexVertices,\n"
 "								__global const int* convexIndices,\n"
 "								__global const btGpuFace* faces,\n"
-"								__global b3Contact4Data* restrict globalContactsOut,\n"
+"								__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "								counter32_t nGlobalContactsOut,\n"
 "								int maxContactCapacity,\n"
 "								float4 posB,\n"
@@ -641,7 +555,6 @@ static const char* primitiveContactsKernelsCL= \
 "								)\n"
 "{\n"
 "	int resultIndex=-1;\n"
-"\n"
 "		int shapeIndex = collidables[collidableIndexB].m_shapeIndex;\n"
 "	__global const ConvexPolyhedronCL* hullB = &convexShapes[shapeIndex];\n"
 "	\n"
@@ -649,10 +562,8 @@ static const char* primitiveContactsKernelsCL= \
 "	posA = rigidBodies[bodyIndexA].m_pos;\n"
 "	Quaternion ornA;\n"
 "	ornA = rigidBodies[bodyIndexA].m_quat;\n"
-"\n"
 "	int numContactsOut = 0;\n"
 "	int numWorldVertsB1= 0;\n"
-"\n"
 "	float4 planeEq;\n"
 "	 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n"
 "	float4 planeNormal = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n"
@@ -675,17 +586,12 @@ static const char* primitiveContactsKernelsCL= \
 "		trMul(invPosB,invOrnB,posA,ornA,&planeInConvexPos1,&planeInConvexOrn1);	\n"
 "	}\n"
 "	\n"
-"	\n"
 "	float4 planeNormalInConvex = qtRotate(planeInConvexOrn1,-planeNormal);\n"
 "	float maxDot = -1e30;\n"
 "	int hitVertex=-1;\n"
 "	float4 hitVtx;\n"
-"\n"
-"\n"
-"\n"
 "	float4 contactPoints[MAX_PLANE_CONVEX_POINTS];\n"
 "	int numPoints = 0;\n"
-"\n"
 "	int4 contactIdx;\n"
 "	contactIdx=make_int4(0,1,2,3);\n"
 "    \n"
@@ -694,8 +600,6 @@ static const char* primitiveContactsKernelsCL= \
 "	{\n"
 "		float4 vtx = convexVertices[hullB->m_vertexOffset+i];\n"
 "		float curDot = dot(vtx,planeNormalInConvex);\n"
-"\n"
-"\n"
 "		if (curDot>maxDot)\n"
 "		{\n"
 "			hitVertex=i;\n"
@@ -705,7 +609,6 @@ static const char* primitiveContactsKernelsCL= \
 "			if (numPoints==MAX_PLANE_CONVEX_POINTS)\n"
 "				numPoints--;\n"
 "		}\n"
-"\n"
 "		if (numPoints<MAX_PLANE_CONVEX_POINTS)\n"
 "		{\n"
 "			float4 vtxWorld = transform(&vtx, &posB, &ornB);\n"
@@ -718,34 +621,29 @@ static const char* primitiveContactsKernelsCL= \
 "				numPoints++;\n"
 "			}\n"
 "		}\n"
-"\n"
 "	}\n"
-"\n"
 "	int numReducedPoints  = numPoints;\n"
 "	if (numPoints>4)\n"
 "	{\n"
 "		numReducedPoints = extractManifoldSequential( contactPoints, numPoints, planeNormalInConvex, &contactIdx);\n"
 "	}\n"
-"\n"
 "	if (numReducedPoints>0)\n"
 "	{\n"
 "		int dstIdx;\n"
 "	    AppendInc( nGlobalContactsOut, dstIdx );\n"
-"\n"
 "		if (dstIdx < maxContactCapacity)\n"
 "		{\n"
 "			resultIndex = dstIdx;\n"
-"			__global b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
 "			c->m_worldNormal = planeNormalWorld;\n"
 "			//c->setFrictionCoeff(0.7);\n"
 "			//c->setRestituitionCoeff(0.f);\n"
-"			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
 "			c->m_batchIdx = pairIndex;\n"
 "			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
 "			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
 "			c->m_childIndexA = -1;\n"
 "			c->m_childIndexB = -1;\n"
-"\n"
 "			switch (numReducedPoints)\n"
 "            {\n"
 "                case 4:\n"
@@ -764,18 +662,15 @@ static const char* primitiveContactsKernelsCL= \
 "			GET_NPOINTS(*c) = numReducedPoints;\n"
 "		}//if (dstIdx < numPairs)\n"
 "	}	\n"
-"\n"
 "	return resultIndex;\n"
 "}\n"
-"\n"
-"\n"
 "void	computeContactPlaneSphere(int pairIndex,\n"
 "																int bodyIndexA, int bodyIndexB, \n"
 "																int collidableIndexA, int collidableIndexB, \n"
 "																__global const BodyData* rigidBodies, \n"
 "																__global const btCollidableGpu* collidables,\n"
 "																__global const btGpuFace* faces,\n"
-"																__global b3Contact4Data* restrict globalContactsOut,\n"
+"																__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "																counter32_t nGlobalContactsOut,\n"
 "																int maxContactCapacity)\n"
 "{\n"
@@ -812,15 +707,14 @@ static const char* primitiveContactsKernelsCL= \
 "		float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n"
 "		float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n"
 "		pOnB1.w = distance;\n"
-"\n"
 "		int dstIdx;\n"
 "    AppendInc( nGlobalContactsOut, dstIdx );\n"
 "		\n"
 "		if (dstIdx < maxContactCapacity)\n"
 "		{\n"
-"			__global b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"			__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
 "			c->m_worldNormal = normalOnSurfaceB1;\n"
-"			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
 "			c->m_batchIdx = pairIndex;\n"
 "			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
 "			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
@@ -831,8 +725,6 @@ static const char* primitiveContactsKernelsCL= \
 "		}//if (dstIdx < numPairs)\n"
 "	}//if (hasCollision)\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void   primitiveContactsKernel( __global int4* pairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
 "																					__global const btCollidableGpu* collidables,\n"
@@ -841,27 +733,23 @@ static const char* primitiveContactsKernelsCL= \
 "																					__global const float4* uniqueEdges,\n"
 "																					__global const btGpuFace* faces,\n"
 "																					__global const int* indices,\n"
-"																					__global b3Contact4Data* restrict globalContactsOut,\n"
+"																					__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "																					counter32_t nGlobalContactsOut,\n"
 "																					int numPairs, int maxContactCapacity)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	int pairIndex = i;\n"
 "	\n"
 "	float4 worldVertsB1[64];\n"
 "	float4 worldVertsB2[64];\n"
 "	int capacityWorldVerts = 64;	\n"
-"\n"
 "	float4 localContactsOut[64];\n"
 "	int localContactCapacity=64;\n"
 "	\n"
 "	float minDist = -1e30f;\n"
 "	float maxDist = 0.02f;\n"
-"\n"
 "	if (i<numPairs)\n"
 "	{\n"
-"\n"
 "		int bodyIndexA = pairs[i].x;\n"
 "		int bodyIndexB = pairs[i].y;\n"
 "			\n"
@@ -871,7 +759,6 @@ static const char* primitiveContactsKernelsCL= \
 "		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n"
 "			collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n"
 "		{\n"
-"\n"
 "			float4 posB;\n"
 "			posB = rigidBodies[bodyIndexB].m_pos;\n"
 "			Quaternion ornB;\n"
@@ -881,31 +768,22 @@ static const char* primitiveContactsKernelsCL= \
 "																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity, posB,ornB);\n"
 "			if (contactIndex>=0)\n"
 "				pairs[pairIndex].z = contactIndex;\n"
-"\n"
 "			return;\n"
 "		}\n"
-"\n"
-"\n"
 "		if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n"
 "			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n"
 "		{\n"
-"\n"
 "			float4 posA;\n"
 "			posA = rigidBodies[bodyIndexA].m_pos;\n"
 "			Quaternion ornA;\n"
 "			ornA = rigidBodies[bodyIndexA].m_quat;\n"
-"\n"
-"\n"
 "			int contactIndex = computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
 "																rigidBodies,collidables,convexShapes,vertices,indices,\n"
 "																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n"
-"\n"
 "			if (contactIndex>=0)\n"
 "				pairs[pairIndex].z = contactIndex;\n"
-"\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n"
 "			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
 "		{\n"
@@ -913,23 +791,16 @@ static const char* primitiveContactsKernelsCL= \
 "																rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n"
 "			return;\n"
 "		}\n"
-"\n"
-"\n"
 "		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
 "			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n"
 "		{\n"
-"\n"
-"\n"
 "			computeContactPlaneSphere( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
 "																rigidBodies,collidables,\n"
 "																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity);\n"
-"\n"
 "			return;\n"
 "		}\n"
 "		\n"
 "	\n"
-"\n"
-"	\n"
 "		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
 "			collidables[collidableIndexB].m_shapeType == SHAPE_CONVEX_HULL)\n"
 "		{\n"
@@ -938,14 +809,11 @@ static const char* primitiveContactsKernelsCL= \
 "			float sphereRadius = collidables[collidableIndexA].m_radius;\n"
 "			float4 convexPos = rigidBodies[bodyIndexB].m_pos;\n"
 "			float4 convexOrn = rigidBodies[bodyIndexB].m_quat;\n"
-"\n"
 "			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
 "																rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
 "																spherePos,sphereRadius,convexPos,convexOrn);\n"
-"\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		if (collidables[collidableIndexA].m_shapeType == SHAPE_CONVEX_HULL &&\n"
 "			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
 "		{\n"
@@ -954,7 +822,6 @@ static const char* primitiveContactsKernelsCL= \
 "			float sphereRadius = collidables[collidableIndexB].m_radius;\n"
 "			float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n"
 "			float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n"
-"\n"
 "			computeContactSphereConvex(pairIndex, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n"
 "																rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
 "																spherePos,sphereRadius,convexPos,convexOrn);\n"
@@ -974,7 +841,6 @@ static const char* primitiveContactsKernelsCL= \
 "			float radiusB = collidables[collidableIndexB].m_radius;\n"
 "			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
 "			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
-"\n"
 "			float4 diff = posA-posB;\n"
 "			float len = length(diff);\n"
 "			\n"
@@ -996,9 +862,9 @@ static const char* primitiveContactsKernelsCL= \
 "				\n"
 "				if (dstIdx < maxContactCapacity)\n"
 "				{\n"
-"					__global b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
 "					c->m_worldNormal = -normalOnSurfaceB;\n"
-"					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
 "					c->m_batchIdx = pairIndex;\n"
 "					int bodyA = pairs[pairIndex].x;\n"
 "					int bodyB = pairs[pairIndex].y;\n"
@@ -1010,15 +876,10 @@ static const char* primitiveContactsKernelsCL= \
 "					GET_NPOINTS(*c) = 1;\n"
 "				}//if (dstIdx < numPairs)\n"
 "			}//if ( len <= (radiusA+radiusB))\n"
-"\n"
 "			return;\n"
 "		}//SHAPE_SPHERE SHAPE_SPHERE\n"
-"\n"
 "	}//	if (i<numPairs)\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   processCompoundPairsPrimitivesKernel( __global const int4* gpuCompoundPairs,\n"
 "													__global const BodyData* rigidBodies, \n"
@@ -1030,18 +891,16 @@ static const char* primitiveContactsKernelsCL= \
 "													__global const int* indices,\n"
 "													__global btAabbCL* aabbs,\n"
 "													__global const btGpuChildShape* gpuChildShapes,\n"
-"													__global b3Contact4Data* restrict globalContactsOut,\n"
+"													__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "													counter32_t nGlobalContactsOut,\n"
 "													int numCompoundPairs, int maxContactCapacity\n"
 "													)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i<numCompoundPairs)\n"
 "	{\n"
 "		int bodyIndexA = gpuCompoundPairs[i].x;\n"
 "		int bodyIndexB = gpuCompoundPairs[i].y;\n"
-"\n"
 "		int childShapeIndexA = gpuCompoundPairs[i].z;\n"
 "		int childShapeIndexB = gpuCompoundPairs[i].w;\n"
 "		\n"
@@ -1087,26 +946,21 @@ static const char* primitiveContactsKernelsCL= \
 "	\n"
 "		int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
 "		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
-"\n"
 "		int pairIndex = i;\n"
 "		if ((shapeTypeA == SHAPE_PLANE) && (shapeTypeB==SHAPE_CONVEX_HULL))\n"
 "		{\n"
-"\n"
 "			computeContactPlaneConvex( pairIndex, bodyIndexA,bodyIndexB,  collidableIndexA,collidableIndexB, \n"
 "																rigidBodies,collidables,convexShapes,vertices,indices,\n"
 "																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posB,ornB);\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB==SHAPE_PLANE))\n"
 "		{\n"
-"\n"
 "			computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
 "																rigidBodies,collidables,convexShapes,vertices,indices,\n"
 "																faces,	globalContactsOut, nGlobalContactsOut,maxContactCapacity,posA,ornA);\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		if ((shapeTypeA == SHAPE_CONVEX_HULL) && (shapeTypeB == SHAPE_SPHERE))\n"
 "		{\n"
 "			float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n"
@@ -1120,16 +974,13 @@ static const char* primitiveContactsKernelsCL= \
 "	\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		if ((shapeTypeA == SHAPE_SPHERE) && (shapeTypeB == SHAPE_CONVEX_HULL))\n"
 "		{\n"
-"\n"
 "			float4 spherePos = rigidBodies[bodyIndexA].m_pos;\n"
 "			float sphereRadius = collidables[collidableIndexA].m_radius;\n"
 "			float4 convexPos = posB;\n"
 "			float4 convexOrn = ornB;\n"
 "			\n"
-"			\n"
 "			computeContactSphereConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
 "										rigidBodies,collidables,convexShapes,vertices,indices,faces, globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
 "										spherePos,sphereRadius,convexPos,convexOrn);\n"
@@ -1138,30 +989,23 @@ static const char* primitiveContactsKernelsCL= \
 "		}\n"
 "	}//	if (i<numCompoundPairs)\n"
 "}\n"
-"\n"
-"\n"
 "bool pointInTriangle(const float4* vertices, const float4* normal, float4 *p )\n"
 "{\n"
-"\n"
 "	const float4* p1 = &vertices[0];\n"
 "	const float4* p2 = &vertices[1];\n"
 "	const float4* p3 = &vertices[2];\n"
-"\n"
 "	float4 edge1;	edge1 = (*p2 - *p1);\n"
 "	float4 edge2;	edge2 = ( *p3 - *p2 );\n"
 "	float4 edge3;	edge3 = ( *p1 - *p3 );\n"
 "	\n"
-"	\n"
 "	float4 p1_to_p; p1_to_p = ( *p - *p1 );\n"
 "	float4 p2_to_p; p2_to_p = ( *p - *p2 );\n"
 "	float4 p3_to_p; p3_to_p = ( *p - *p3 );\n"
-"\n"
 "	float4 edge1_normal; edge1_normal = ( cross(edge1,*normal));\n"
 "	float4 edge2_normal; edge2_normal = ( cross(edge2,*normal));\n"
 "	float4 edge3_normal; edge3_normal = ( cross(edge3,*normal));\n"
 "	\n"
 "	\n"
-"	\n"
 "	float r1, r2, r3;\n"
 "	r1 = dot(edge1_normal,p1_to_p );\n"
 "	r2 = dot(edge2_normal,p2_to_p );\n"
@@ -1172,10 +1016,7 @@ static const char* primitiveContactsKernelsCL= \
 "    if ( r1 <= 0 && r2 <= 0 && r3 <= 0 ) \n"
 "		return true;\n"
 "	return false;\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "float segmentSqrDistance(float4 from, float4 to,float4 p, float4* nearest) \n"
 "{\n"
 "	float4 diff = p - from;\n"
@@ -1201,15 +1042,13 @@ static const char* primitiveContactsKernelsCL= \
 "	*nearest = from + t*v;\n"
 "	return dot(diff,diff);	\n"
 "}\n"
-"\n"
-"\n"
 "void	computeContactSphereTriangle(int pairIndex,\n"
 "									int bodyIndexA, int bodyIndexB,\n"
 "									int collidableIndexA, int collidableIndexB, \n"
 "									__global const BodyData* rigidBodies, \n"
 "									__global const btCollidableGpu* collidables,\n"
 "									const float4* triangleVertices,\n"
-"									__global b3Contact4Data* restrict globalContactsOut,\n"
+"									__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "									counter32_t nGlobalContactsOut,\n"
 "									int maxContactCapacity,\n"
 "									float4 spherePos2,\n"
@@ -1219,10 +1058,8 @@ static const char* primitiveContactsKernelsCL= \
 "									int faceIndex\n"
 "									)\n"
 "{\n"
-"\n"
 "	float4 invPos;\n"
 "	float4 invOrn;\n"
-"\n"
 "	trInverse(pos,quat, &invPos,&invOrn);\n"
 "	float4 spherePos = transform(&spherePos2,&invPos,&invOrn);\n"
 "	int numFaces = 3;\n"
@@ -1231,12 +1068,9 @@ static const char* primitiveContactsKernelsCL= \
 "	float minDist = -1000000.f;\n"
 "	bool bCollide = false;\n"
 "	\n"
-"	\n"
 "	//////////////////////////////////////\n"
-"\n"
 "	float4 sphereCenter;\n"
 "	sphereCenter = spherePos;\n"
-"\n"
 "	const float4* vertices = triangleVertices;\n"
 "	float contactBreakingThreshold = 0.f;//todo?\n"
 "	float radiusWithThreshold = radius + contactBreakingThreshold;\n"
@@ -1252,7 +1086,6 @@ static const char* primitiveContactsKernelsCL= \
 "	p1ToCenter = sphereCenter - vertices[0];\n"
 "	\n"
 "	float distanceFromPlane = dot(p1ToCenter,normal);\n"
-"\n"
 "	if (distanceFromPlane < 0.f)\n"
 "	{\n"
 "		//triangle facing the other way\n"
@@ -1260,7 +1093,6 @@ static const char* primitiveContactsKernelsCL= \
 "		normal *= -1.f;\n"
 "	}\n"
 "	hitNormalWorld = normal;\n"
-"\n"
 "	bool isInsideContactPlane = distanceFromPlane < radiusWithThreshold;\n"
 "	\n"
 "	// Check for contact / intersection\n"
@@ -1284,7 +1116,6 @@ static const char* primitiveContactsKernelsCL= \
 "			{\n"
 "				float4 pa =vertices[i];\n"
 "				float4 pb = vertices[(i+1)%3];\n"
-"\n"
 "				float distanceSqr = segmentSqrDistance(pa,pb,sphereCenter, &nearestOnEdge);\n"
 "				if (distanceSqr < contactCapsuleRadiusSqr) \n"
 "				{\n"
@@ -1297,10 +1128,8 @@ static const char* primitiveContactsKernelsCL= \
 "			}\n"
 "		}\n"
 "	}\n"
-"\n"
 "	if (hasContact) \n"
 "	{\n"
-"\n"
 "		closestPnt = contactPoint;\n"
 "		float4 contactToCenter = sphereCenter - contactPoint;\n"
 "		minDist = length(contactToCenter);\n"
@@ -1311,10 +1140,7 @@ static const char* primitiveContactsKernelsCL= \
 "		}\n"
 "		\n"
 "	}\n"
-"\n"
-"\n"
 "	/////////////////////////////////////\n"
-"\n"
 "	if (bCollide && minDist > -10000)\n"
 "	{\n"
 "		\n"
@@ -1322,13 +1148,11 @@ static const char* primitiveContactsKernelsCL= \
 "		float4 pOnB1 = transform(&closestPnt,&pos,&quat);\n"
 "		float actualDepth = minDist-radius;\n"
 "		\n"
-"		\n"
 "		if (actualDepth<=0.f)\n"
 "		{\n"
 "			pOnB1.w = actualDepth;\n"
 "			int dstIdx;\n"
 "			\n"
-"			\n"
 "			float lenSqr = dot3F4(normalOnSurfaceB1,normalOnSurfaceB1);\n"
 "			if (lenSqr>FLT_EPSILON)\n"
 "			{\n"
@@ -1336,28 +1160,21 @@ static const char* primitiveContactsKernelsCL= \
 "			\n"
 "				if (dstIdx < maxContactCapacity)\n"
 "				{\n"
-"					__global b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
+"					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];\n"
 "					c->m_worldNormal = normalOnSurfaceB1;\n"
-"					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
 "					c->m_batchIdx = pairIndex;\n"
 "					c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
 "					c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
 "					c->m_worldPos[0] = pOnB1;\n"
-"\n"
 "					c->m_childIndexA = -1;\n"
 "					c->m_childIndexB = faceIndex;\n"
-"\n"
 "					GET_NPOINTS(*c) = 1;\n"
 "				} \n"
 "			}\n"
-"\n"
 "		}\n"
 "	}//if (hasCollision)\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   findConcaveSphereContactsKernel( __global int4* concavePairs,\n"
 "												__global const BodyData* rigidBodies,\n"
@@ -1368,26 +1185,21 @@ static const char* primitiveContactsKernelsCL= \
 "												__global const btGpuFace* faces,\n"
 "												__global const int* indices,\n"
 "												__global btAabbCL* aabbs,\n"
-"												__global b3Contact4Data* restrict globalContactsOut,\n"
+"												__global struct b3Contact4Data* restrict globalContactsOut,\n"
 "												counter32_t nGlobalContactsOut,\n"
 "													int numConcavePairs, int maxContactCapacity\n"
 "												)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numConcavePairs)\n"
 "		return;\n"
 "	int pairIdx = i;\n"
-"\n"
 "	int bodyIndexA = concavePairs[i].x;\n"
 "	int bodyIndexB = concavePairs[i].y;\n"
-"\n"
 "	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
 "	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
-"\n"
 "	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
 "	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
-"\n"
 "	if (collidables[collidableIndexB].m_shapeType==SHAPE_SPHERE)\n"
 "	{\n"
 "		int f = concavePairs[i].z;\n"
@@ -1400,18 +1212,15 @@ static const char* primitiveContactsKernelsCL= \
 "			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
 "			verticesA[i] = vert;\n"
 "		}\n"
-"\n"
 "		float4 spherePos = rigidBodies[bodyIndexB].m_pos;\n"
 "		float sphereRadius = collidables[collidableIndexB].m_radius;\n"
 "		float4 convexPos = rigidBodies[bodyIndexA].m_pos;\n"
 "		float4 convexOrn = rigidBodies[bodyIndexA].m_quat;\n"
-"\n"
 "		computeContactSphereTriangle(i, bodyIndexB, bodyIndexA, collidableIndexB, collidableIndexA, \n"
 "																rigidBodies,collidables,\n"
 "																verticesA,\n"
 "																globalContactsOut, nGlobalContactsOut,maxContactCapacity,\n"
 "																spherePos,sphereRadius,convexPos,convexOrn, f);\n"
-"\n"
 "		return;\n"
 "	}\n"
 "}\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
@@ -41,22 +41,7 @@ typedef unsigned int u32;



-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;	//	w: m_nPoints
-
-	u32 m_coeffs;
-	u32 m_batchIdx;
-	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	float m_unused1;
-	int m_unused2;
-
-} Contact4;
+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"


 ///keep this in sync with btCollidable.h
@@ -891,7 +876,7 @@ __kernel void   extractManifoldAndAddContactKernel(__global const int4* pairs,
 																	__global const float4* separatingNormalsWorld,
 																	__global const int* contactCounts,
 																	__global const int* contactOffsets,
-																	__global Contact4* restrict contactsOut,
+																	__global struct b3Contact4Data* restrict contactsOut,
 																	counter32_t nContactsOut,
 																	int numPairs,
 																	int pairIndex
@@ -922,9 +907,9 @@ __kernel void   extractManifoldAndAddContactKernel(__global const int4* pairs,
 		AppendInc( nContactsOut, dstIdx );
 		//if ((dstIdx+nContacts) < capacity)
 		{
-			__global Contact4* c = contactsOut + dstIdx;
+			__global struct b3Contact4Data* c = contactsOut + dstIdx;
 			c->m_worldNormal = normal;
-			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+			c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 			c->m_batchIdx = idx;
 			int bodyA = pairs[pairIndex].x;
 			int bodyB = pairs[pairIndex].y;
@@ -970,7 +955,7 @@ __kernel void   clipHullHullKernel( __global int4* pairs,
 																					__global const int* indices,
 																					__global const float4* separatingNormals,
 																					__global const int* hasSeparatingAxis,
-																					__global Contact4* restrict globalContactsOut,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
 																					counter32_t nGlobalContactsOut,
 																					int numPairs,
 																					int contactCapacity)
@@ -1037,9 +1022,9 @@ __kernel void   clipHullHullKernel( __global int4* pairs,
 				{
 					pairs[pairIndex].z = dstIdx;

-					__global Contact4* c = globalContactsOut+ dstIdx;
+					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
 					c->m_worldNormal = normal;
-					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 					c->m_batchIdx = pairIndex;
 					int bodyA = pairs[pairIndex].x;
 					int bodyB = pairs[pairIndex].y;
@@ -1073,7 +1058,7 @@ __kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPai
 																					__global const btGpuChildShape* gpuChildShapes,
 																					__global const float4* gpuCompoundSepNormalsOut,
 																					__global const int* gpuHasCompoundSepNormalsOut,
-																					__global Contact4* restrict globalContactsOut,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
 																					counter32_t nGlobalContactsOut,
 																					int numCompoundPairs, int maxContactCapacity)
 {
@@ -1170,9 +1155,9 @@ __kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPai
 				AppendInc( nGlobalContactsOut, dstIdx );
 				if ((dstIdx+nReducedContacts) < maxContactCapacity)
 				{
-					__global Contact4* c = globalContactsOut+ dstIdx;
+					__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
 					c->m_worldNormal = normal;
-					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 					c->m_batchIdx = pairIndex;
 					int bodyA = gpuCompoundPairs[pairIndex].x;
 					int bodyB = gpuCompoundPairs[pairIndex].y;
@@ -1200,7 +1185,7 @@ __kernel void   sphereSphereCollisionKernel( __global const int4* pairs,
 																					__global const btCollidableGpu* collidables,
 																					__global const float4* separatingNormals,
 																					__global const int* hasSeparatingAxis,
-																					__global Contact4* restrict globalContactsOut,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
 																					counter32_t nGlobalContactsOut,
 																					int numPairs)
 {
@@ -1246,9 +1231,9 @@ __kernel void   sphereSphereCollisionKernel( __global const int4* pairs,
 				
 				if (dstIdx < numPairs)
 				{
-					__global Contact4* c = &globalContactsOut[dstIdx];
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 					c->m_worldNormal = normalOnSurfaceB;
-					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 					c->m_batchIdx = pairIndex;
 					int bodyA = pairs[pairIndex].x;
 					int bodyB = pairs[pairIndex].y;
@@ -1275,7 +1260,7 @@ __kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,
 																					__global const int* indices,
 																					__global const btGpuChildShape* gpuChildShapes,
 																					__global const float4* separatingNormals,
-																					__global Contact4* restrict globalContactsOut,
+																					__global struct b3Contact4Data* restrict globalContactsOut,
 																					counter32_t nGlobalContactsOut,
 																					int numConcavePairs)
 {
@@ -1479,9 +1464,9 @@ __kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,
 			AppendInc( nGlobalContactsOut, dstIdx );
 			//if ((dstIdx+nReducedContacts) < capacity)
 			{
-				__global Contact4* c = globalContactsOut+ dstIdx;
+				__global struct b3Contact4Data* c = globalContactsOut+ dstIdx;
 				c->m_worldNormal = normal;
-				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 				c->m_batchIdx = pairIndex;
 				int bodyA = concavePairsIn[pairIndex].x;
 				int bodyB = concavePairsIn[pairIndex].y;
@@ -1747,7 +1732,7 @@ __kernel void   clipFacesAndContactReductionKernel( __global int4* pairs,
                                                   __global const BodyData* rigidBodies,
                                                   __global const float4* separatingNormals,
                                                   __global const int* hasSeparatingAxis,
-                                                     __global Contact4* globalContactsOut,
+                                                     __global struct b3Contact4Data* globalContactsOut,
                                                   __global int4* clippingFacesOut,
                                                   __global float4* worldVertsA1,
                                                   __global float4* worldNormalsA1,
@@ -1860,7 +1845,7 @@ __kernel void   newContactReductionKernel( __global int4* pairs,
                                                   __global const BodyData* rigidBodies,
                                                   __global const float4* separatingNormals,
                                                   __global const int* hasSeparatingAxis,
-                                                   __global Contact4* globalContactsOut,
+                                                   __global struct b3Contact4Data* globalContactsOut,
                                                   __global int4* clippingFaces,
                                                   __global float4* worldVertsB2,
                                                   volatile __global int* nGlobalContactsOut,
@@ -1901,9 +1886,9 @@ __kernel void   newContactReductionKernel( __global int4* pairs,
 				if (dstIdx < numPairs)
 				{

-					__global Contact4* c = &globalContactsOut[dstIdx];
+					__global struct b3Contact4Data* c = &globalContactsOut[dstIdx];
 					c->m_worldNormal = normal;
-					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
 					c->m_batchIdx = pairIndex;
 					int bodyA = pairs[pairIndex].x;
 					int bodyB = pairs[pairIndex].y;
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
@@ -2,17 +2,11 @@
 static const char* satKernelsCL= \
 "//keep this enum in sync with the CPU version (in btCollidable.h)\n"
 "//written by Erwin Coumans\n"
-"\n"
-"\n"
 "#define SHAPE_CONVEX_HULL 3\n"
 "#define SHAPE_CONCAVE_TRIMESH 5\n"
 "#define TRIANGLE_NUM_CONVEX_FACES 5\n"
 "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
-"\n"
-"\n"
-"\n"
 "typedef unsigned int u32;\n"
-"\n"
 "///keep this in sync with btCollidable.h\n"
 "typedef struct\n"
 "{\n"
@@ -22,7 +16,6 @@ static const char* satKernelsCL= \
 "	int m_shapeIndex;\n"
 "	\n"
 "} btCollidableGpu;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4	m_childPosition;\n"
@@ -32,22 +25,17 @@ static const char* satKernelsCL= \
 "	int m_unused1;\n"
 "	int m_unused2;\n"
 "} btGpuChildShape;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} BodyData;\n"
-"\n"
-"\n"
 "typedef struct  \n"
 "{\n"
 "	float4		m_localCenter;\n"
@@ -59,13 +47,11 @@ static const char* satKernelsCL= \
 "	int	m_faceOffset;\n"
 "	int m_numFaces;\n"
 "	int	m_numVertices;\n"
-"\n"
 "	int m_vertexOffset;\n"
 "	int	m_uniqueEdgesOffset;\n"
 "	int	m_numUniqueEdges;\n"
 "	int m_unused;\n"
 "} ConvexPolyhedronCL;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	union\n"
@@ -81,35 +67,27 @@ static const char* satKernelsCL= \
 "		int			m_maxIndices[4];\n"
 "	};\n"
 "} btAabbCL;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_plane;\n"
 "	int m_indexOffset;\n"
 "	int m_numIndices;\n"
 "} btGpuFace;\n"
-"\n"
 "#define make_float4 (float4)\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "	\n"
-"	\n"
 "//	float4 a1 = make_float4(a.xyz,0.f);\n"
 "//	float4 b1 = make_float4(b.xyz,0.f);\n"
-"\n"
 "//	return cross(a1,b1);\n"
-"\n"
 "//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n"
 "	\n"
 "	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n"
 "	\n"
 "	//return c;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -117,36 +95,24 @@ static const char* satKernelsCL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	v = make_float4(v.xyz,0.f);\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -157,7 +123,6 @@ static const char* satKernelsCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -174,41 +139,33 @@ static const char* satKernelsCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
 "{\n"
 "	return qtRotate( *orientation, *p ) + (*translation);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
 "	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
 "	return fastNormalize4( n );\n"
 "}\n"
-"\n"
 "inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
 "const float4* dir, const float4* vertices, float* min, float* max)\n"
 "{\n"
 "	min[0] = FLT_MAX;\n"
 "	max[0] = -FLT_MAX;\n"
 "	int numVerts = hull->m_numVertices;\n"
-"\n"
 "	const float4 localDir = qtInvRotate(orn,*dir);\n"
 "	float offset = dot(pos,*dir);\n"
 "	for(int i=0;i<numVerts;i++)\n"
@@ -228,14 +185,12 @@ static const char* satKernelsCL= \
 "	min[0] += offset;\n"
 "	max[0] += offset;\n"
 "}\n"
-"\n"
 "inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
 "const float4* dir, __global const float4* vertices, float* min, float* max)\n"
 "{\n"
 "	min[0] = FLT_MAX;\n"
 "	max[0] = -FLT_MAX;\n"
 "	int numVerts = hull->m_numVertices;\n"
-"\n"
 "	const float4 localDir = qtInvRotate(orn,*dir);\n"
 "	float offset = dot(pos,*dir);\n"
 "	for(int i=0;i<numVerts;i++)\n"
@@ -255,7 +210,6 @@ static const char* satKernelsCL= \
 "	min[0] += offset;\n"
 "	max[0] += offset;\n"
 "}\n"
-"\n"
 "inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA,const float4 ornA,\n"
 "	const float4 posB,const float4 ornB,\n"
@@ -265,28 +219,19 @@ static const char* satKernelsCL= \
 "	float Min1,Max1;\n"
 "	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &Min0, &Max0);\n"
 "	project(hullB,posB,ornB, sep_axis,verticesB, &Min1, &Max1);\n"
-"\n"
 "	if(Max0<Min1 || Max1<Min0)\n"
 "		return false;\n"
-"\n"
 "	float d0 = Max0 - Min1;\n"
 "	float d1 = Max1 - Min0;\n"
 "	*depth = d0<d1 ? d0:d1;\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "inline bool IsAlmostZero(const float4 v)\n"
 "{\n"
 "	if(fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f)\n"
 "		return false;\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA1,\n"
 "	const float4 ornA,\n"
@@ -298,7 +243,6 @@ static const char* satKernelsCL= \
 "	const float4* uniqueEdgesA, \n"
 "	const btGpuFace* facesA,\n"
 "	const int*  indicesA,\n"
-"\n"
 "	__global const float4* verticesB, \n"
 "	__global const float4* uniqueEdgesB, \n"
 "	__global const btGpuFace* facesB,\n"
@@ -307,7 +251,6 @@ static const char* satKernelsCL= \
 "	float* dmin)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
-"\n"
 "	float4 posA = posA1;\n"
 "	posA.w = 0.f;\n"
 "	float4 posB = posB1;\n"
@@ -339,7 +282,6 @@ static const char* satKernelsCL= \
 "	}\n"
 "	return true;\n"
 "}\n"
-"\n"
 "bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA1,\n"
 "	const float4 ornA,\n"
@@ -358,7 +300,6 @@ static const char* satKernelsCL= \
 "	float* dmin)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
-"\n"
 "	float4 posA = posA1;\n"
 "	posA.w = 0.f;\n"
 "	float4 posB = posB1;\n"
@@ -390,9 +331,6 @@ static const char* satKernelsCL= \
 "	}\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA1,\n"
 "	const float4 ornA,\n"
@@ -411,36 +349,28 @@ static const char* satKernelsCL= \
 "	float* dmin)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
-"\n"
 "	float4 posA = posA1;\n"
 "	posA.w = 0.f;\n"
 "	float4 posB = posB1;\n"
 "	posB.w = 0.f;\n"
-"\n"
 "	int curPlaneTests=0;\n"
-"\n"
 "	int curEdgeEdge = 0;\n"
 "	// Test edges\n"
 "	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
 "	{\n"
 "		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n"
 "		float4 edge0World = qtRotate(ornA,edge0);\n"
-"\n"
 "		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
 "		{\n"
 "			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n"
 "			float4 edge1World = qtRotate(ornB,edge1);\n"
-"\n"
-"\n"
 "			float4 crossje = cross3(edge0World,edge1World);\n"
-"\n"
 "			curEdgeEdge++;\n"
 "			if(!IsAlmostZero(crossje))\n"
 "			{\n"
 "				crossje = normalize3(crossje);\n"
 "				if (dot3F4(DeltaC2,crossje)<0)\n"
 "					crossje *= -1.f;\n"
-"\n"
 "				float dist;\n"
 "				bool result = true;\n"
 "				{\n"
@@ -456,10 +386,8 @@ static const char* satKernelsCL= \
 "					float d1 = Max1 - Min0;\n"
 "					dist = d0<d1 ? d0:d1;\n"
 "					result = true;\n"
-"\n"
 "				}\n"
 "				\n"
-"\n"
 "				if(dist<*dmin)\n"
 "				{\n"
 "					*dmin = dist;\n"
@@ -467,18 +395,14 @@ static const char* satKernelsCL= \
 "				}\n"
 "			}\n"
 "		}\n"
-"\n"
 "	}\n"
 "	\n"
-"	\n"
 "	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
 "	{\n"
 "		*sep = -(*sep);\n"
 "	}\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
 "inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA,const float4 ornA,\n"
 "	const float4 posB,const float4 ornB,\n"
@@ -488,17 +412,13 @@ static const char* satKernelsCL= \
 "	float Min1,Max1;\n"
 "	project(hullA,posA,ornA,sep_axis,vertices, &Min0, &Max0);\n"
 "	project(hullB,posB,ornB, sep_axis,vertices, &Min1, &Max1);\n"
-"\n"
 "	if(Max0<Min1 || Max1<Min0)\n"
 "		return false;\n"
-"\n"
 "	float d0 = Max0 - Min1;\n"
 "	float d1 = Max1 - Min0;\n"
 "	*depth = d0<d1 ? d0:d1;\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
 "bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA1,\n"
 "	const float4 ornA,\n"
@@ -513,14 +433,12 @@ static const char* satKernelsCL= \
 "	float* dmin)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
-"\n"
 "	float4 posA = posA1;\n"
 "	posA.w = 0.f;\n"
 "	float4 posB = posB1;\n"
 "	posB.w = 0.f;\n"
 "	\n"
 "	int curPlaneTests=0;\n"
-"\n"
 "	{\n"
 "		int numFacesA = hullA->m_numFaces;\n"
 "		// Test normals from hullA\n"
@@ -545,8 +463,6 @@ static const char* satKernelsCL= \
 "			}\n"
 "		}\n"
 "	}\n"
-"\n"
-"\n"
 "		if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
 "		{\n"
 "			*sep = -(*sep);\n"
@@ -554,10 +470,6 @@ static const char* satKernelsCL= \
 "	\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA1,\n"
 "	const float4 ornA,\n"
@@ -572,29 +484,22 @@ static const char* satKernelsCL= \
 "	float* dmin)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
-"\n"
 "	float4 posA = posA1;\n"
 "	posA.w = 0.f;\n"
 "	float4 posB = posB1;\n"
 "	posB.w = 0.f;\n"
-"\n"
 "	int curPlaneTests=0;\n"
-"\n"
 "	int curEdgeEdge = 0;\n"
 "	// Test edges\n"
 "	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
 "	{\n"
 "		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n"
 "		float4 edge0World = qtRotate(ornA,edge0);\n"
-"\n"
 "		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
 "		{\n"
 "			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n"
 "			float4 edge1World = qtRotate(ornB,edge1);\n"
-"\n"
-"\n"
 "			float4 crossje = cross3(edge0World,edge1World);\n"
-"\n"
 "			curEdgeEdge++;\n"
 "			if(!IsAlmostZero(crossje))\n"
 "			{\n"
@@ -617,10 +522,8 @@ static const char* satKernelsCL= \
 "					float d1 = Max1 - Min0;\n"
 "					dist = d0<d1 ? d0:d1;\n"
 "					result = true;\n"
-"\n"
 "				}\n"
 "				\n"
-"\n"
 "				if(dist<*dmin)\n"
 "				{\n"
 "					*dmin = dist;\n"
@@ -628,18 +531,14 @@ static const char* satKernelsCL= \
 "				}\n"
 "			}\n"
 "		}\n"
-"\n"
 "	}\n"
 "	\n"
-"	\n"
 "	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
 "	{\n"
 "		*sep = -(*sep);\n"
 "	}\n"
 "	return true;\n"
 "}\n"
-"\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n"
 "																					__global const BodyData* rigidBodies, \n"
@@ -656,13 +555,11 @@ static const char* satKernelsCL= \
 "																					int numCompoundPairs\n"
 "																					)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i<numCompoundPairs)\n"
 "	{\n"
 "		int bodyIndexA = gpuCompoundPairs[i].x;\n"
 "		int bodyIndexB = gpuCompoundPairs[i].y;\n"
-"\n"
 "		int childShapeIndexA = gpuCompoundPairs[i].z;\n"
 "		int childShapeIndexB = gpuCompoundPairs[i].w;\n"
 "		\n"
@@ -711,12 +608,10 @@ static const char* satKernelsCL= \
 "		int shapeTypeA = collidables[collidableIndexA].m_shapeType;\n"
 "		int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
 "	\n"
-"\n"
 "		if ((shapeTypeA != SHAPE_CONVEX_HULL) || (shapeTypeB != SHAPE_CONVEX_HULL))\n"
 "		{\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		int hasSeparatingAxis = 5;\n"
 "							\n"
 "		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
@@ -737,7 +632,6 @@ static const char* satKernelsCL= \
 "		} else\n"
 "		{\n"
 "			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"
-"\n"
 "			if (!sepB)\n"
 "			{\n"
 "				hasSeparatingAxis = 0;\n"
@@ -756,7 +650,6 @@ static const char* satKernelsCL= \
 "	}\n"
 "		\n"
 "}\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   findCompoundPairsKernel( __global const int4* pairs, \n"
 "	__global const BodyData* rigidBodies, \n"
@@ -774,48 +667,36 @@ static const char* satKernelsCL= \
 "	int maxNumCompoundPairsCapacity\n"
 "	)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
-"\n"
 "	if (i<numPairs)\n"
 "	{\n"
 "		int bodyIndexA = pairs[i].x;\n"
 "		int bodyIndexB = pairs[i].y;\n"
-"\n"
 "		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
 "		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
-"\n"
 "		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
 "		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
-"\n"
-"\n"
 "		//once the broadphase avoids static-static pairs, we can remove this test\n"
 "		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
 "		{\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"
 "		{\n"
-"\n"
 "			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) \n"
 "			{\n"
-"\n"
 "				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n"
 "				for (int c=0;c<numChildrenA;c++)\n"
 "				{\n"
 "					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;\n"
 "					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
-"\n"
 "					float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
 "					float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
 "					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
 "					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
 "					float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
 "					float4 newOrnA = qtMul(ornA,childOrnA);\n"
-"\n"
 "					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;\n"
-"\n"
 "					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
 "					{\n"
 "						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
@@ -829,9 +710,7 @@ static const char* satKernelsCL= \
 "							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
 "							float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
 "							float4 newOrnB = qtMul(ornB,childOrnB);\n"
-"\n"
 "							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
-"\n"
 "							if (1)\n"
 "							{\n"
 "								int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
@@ -847,7 +726,6 @@ static const char* satKernelsCL= \
 "								float4 ornB =newOrnB;\n"
 "								float4 c1 = transform(&c1local,&posB,&ornB);\n"
 "								const float4 DeltaC2 = c0 - c1;\n"
-"\n"
 "								{//\n"
 "									int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
 "									if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
@@ -875,7 +753,6 @@ static const char* satKernelsCL= \
 "							float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
 "							float4 c1 = transform(&c1local,&posB,&ornB);\n"
 "							const float4 DeltaC2 = c0 - c1;\n"
-"\n"
 "							{\n"
 "								int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
 "								if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
@@ -902,12 +779,8 @@ static const char* satKernelsCL= \
 "					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
 "					float4 newPosB = qtRotate(ornB,childPosB)+posB;\n"
 "					float4 newOrnB = qtMul(ornB,childOrnB);\n"
-"\n"
 "					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
-"\n"
-"\n"
 "					//////////////////////////////////////\n"
-"\n"
 "					if (1)\n"
 "					{\n"
 "						int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
@@ -938,7 +811,6 @@ static const char* satKernelsCL= \
 "		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))\n"
 "	}//i<numPairs\n"
 "}\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   findSeparatingAxisKernel( __global const int4* pairs, \n"
 "																					__global const BodyData* rigidBodies, \n"
@@ -954,16 +826,13 @@ static const char* satKernelsCL= \
 "																					int numPairs\n"
 "																					)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	\n"
 "	if (i<numPairs)\n"
 "	{\n"
 "	\n"
-"	\n"
 "		int bodyIndexA = pairs[i].x;\n"
 "		int bodyIndexB = pairs[i].y;\n"
-"\n"
 "		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
 "		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
 "	\n"
@@ -978,7 +847,6 @@ static const char* satKernelsCL= \
 "			return;\n"
 "		}\n"
 "		\n"
-"\n"
 "		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"
 "		{\n"
 "			hasSeparatingAxis[i] = 0;\n"
@@ -990,11 +858,8 @@ static const char* satKernelsCL= \
 "			hasSeparatingAxis[i] = 0;\n"
 "			return;\n"
 "		}\n"
-"\n"
 "		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
-"\n"
 "		float dmin = FLT_MAX;\n"
-"\n"
 "		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
 "		posA.w = 0.f;\n"
 "		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
@@ -1024,7 +889,6 @@ static const char* satKernelsCL= \
 "																									DeltaC2,\n"
 "																									vertices,uniqueEdges,faces,\n"
 "																									indices,&sepNormal,&dmin);\n"
-"\n"
 "			if (!sepB)\n"
 "			{\n"
 "				hasSeparatingAxis[i] = 0;\n"
@@ -1047,12 +911,7 @@ static const char* satKernelsCL= \
 "		}\n"
 "		\n"
 "	}\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "// work-in-progress\n"
 "__kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,\n"
 "																					__global const BodyData* rigidBodies,\n"
@@ -1068,30 +927,22 @@ static const char* satKernelsCL= \
 "																					int numConcavePairs\n"
 "																					)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numConcavePairs)\n"
 "		return;\n"
 "	int pairIdx = i;\n"
-"\n"
 "	int bodyIndexA = concavePairs[i].x;\n"
 "	int bodyIndexB = concavePairs[i].y;\n"
-"\n"
 "	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
 "	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
-"\n"
 "	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
 "	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
-"\n"
 "	if (collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL&&\n"
 "		collidables[collidableIndexB].m_shapeType!=SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
 "	{\n"
 "		concavePairs[pairIdx].w = -1;\n"
 "		return;\n"
 "	}\n"
-"\n"
-"\n"
-"\n"
 "	int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
 "	int numActualConcaveConvexTests = 0;\n"
 "	\n"
@@ -1100,12 +951,10 @@ static const char* satKernelsCL= \
 "	bool overlap = false;\n"
 "	\n"
 "	ConvexPolyhedronCL convexPolyhedronA;\n"
-"\n"
 "	//add 3 vertices of the triangle\n"
 "	convexPolyhedronA.m_numVertices = 3;\n"
 "	convexPolyhedronA.m_vertexOffset = 0;\n"
 "	float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
-"\n"
 "	btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
 "	float4 triMinAabb, triMaxAabb;\n"
 "	btAabbCL triAabb;\n"
@@ -1122,9 +971,7 @@ static const char* satKernelsCL= \
 "			\n"
 "		triAabb.m_min = min(triAabb.m_min,vert);		\n"
 "		triAabb.m_max = max(triAabb.m_max,vert);		\n"
-"\n"
 "	}\n"
-"\n"
 "	overlap = true;\n"
 "	overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n"
 "	overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
@@ -1135,10 +982,8 @@ static const char* satKernelsCL= \
 "		float dmin = FLT_MAX;\n"
 "		int hasSeparatingAxis=5;\n"
 "		float4 sepAxis=make_float4(1,2,3,4);\n"
-"\n"
 "		int localCC=0;\n"
 "		numActualConcaveConvexTests++;\n"
-"\n"
 "		//a triangle has 3 unique edges\n"
 "		convexPolyhedronA.m_numUniqueEdges = 3;\n"
 "		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
@@ -1147,8 +992,6 @@ static const char* satKernelsCL= \
 "		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
 "		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
 "		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
-"\n"
-"\n"
 "		convexPolyhedronA.m_faceOffset = 0;\n"
 "                                  \n"
 "		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
@@ -1157,7 +1000,6 @@ static const char* satKernelsCL= \
 "		int indicesA[3+3+2+2+2];\n"
 "		int curUsedIndices=0;\n"
 "		int fidx=0;\n"
-"\n"
 "		//front size of triangle\n"
 "		{\n"
 "			facesA[fidx].m_indexOffset=curUsedIndices;\n"
@@ -1189,7 +1031,6 @@ static const char* satKernelsCL= \
 "			facesA[fidx].m_numIndices=3;\n"
 "		}\n"
 "		fidx++;\n"
-"\n"
 "		bool addEdgePlanes = true;\n"
 "		if (addEdgePlanes)\n"
 "		{\n"
@@ -1202,7 +1043,6 @@ static const char* satKernelsCL= \
 "                                            \n"
 "				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
 "				float c = -dot(edgeNormal,v0);\n"
-"\n"
 "				facesA[fidx].m_numIndices = 2;\n"
 "				facesA[fidx].m_indexOffset=curUsedIndices;\n"
 "				indicesA[curUsedIndices++]=i;\n"
@@ -1218,22 +1058,15 @@ static const char* satKernelsCL= \
 "		}\n"
 "		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
 "		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
-"\n"
-"\n"
 "		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
 "		posA.w = 0.f;\n"
 "		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
 "		posB.w = 0.f;\n"
-"\n"
 "		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
 "		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
 "		\n"
-"		\n"
-"\n"
-"\n"
 "		///////////////////\n"
 "		///compound shape support\n"
-"\n"
 "		if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
 "		{\n"
 "			int compoundChild = concavePairs[pairIdx].w;\n"
@@ -1248,14 +1081,11 @@ static const char* satKernelsCL= \
 "			shapeIndexB = collidables[childColIndexB].m_shapeIndex;\n"
 "		}\n"
 "		//////////////////\n"
-"\n"
 "		float4 c0local = convexPolyhedronA.m_localCenter;\n"
 "		float4 c0 = transform(&c0local, &posA, &ornA);\n"
 "		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
 "		float4 c1 = transform(&c1local,&posB,&ornB);\n"
 "		const float4 DeltaC2 = c0 - c1;\n"
-"\n"
-"\n"
 "		bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
 "												posA,ornA,\n"
 "												posB,ornB,\n"
@@ -1276,7 +1106,6 @@ static const char* satKernelsCL= \
 "												vertices,uniqueEdges,faces,indices,\n"
 "												verticesA,uniqueEdgesA,facesA,indicesA,\n"
 "												&sepAxis,&dmin);\n"
-"\n"
 "			if (!sepB)\n"
 "			{\n"
 "				hasSeparatingAxis = 0;\n"
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
@@ -2,59 +2,45 @@
 static const char* boundSearchKernelsCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "typedef unsigned int u32;\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
 "#define GET_GROUP_SIZE get_local_size(0)\n"
 "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	u32 m_key; \n"
 "	u32 m_value;\n"
 "}SortData;\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	u32 m_nSrc;\n"
 "	u32 m_nDst;\n"
 "	u32 m_padding[2];\n"
 "} ConstBuffer;\n"
-"\n"
-"\n"
-"\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "__kernel\n"
 "void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
 "					unsigned int nSrc, unsigned int nDst)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nSrc )\n"
 "	{\n"
 "		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
 "		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
-"\n"
 "		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
 "		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
-"\n"
 "		if( iData.m_key != jData.m_key )\n"
 "		{\n"
 "//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
@@ -65,23 +51,18 @@ static const char* boundSearchKernelsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "__kernel\n"
 "void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
 "					unsigned int nSrc, unsigned int nDst)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX+1;\n"
-"\n"
 "	if( gIdx < nSrc+1 )\n"
 "	{\n"
 "		SortData first; first.m_key = 0; first.m_value = 0;\n"
 "		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
-"\n"
 "		SortData iData = src[gIdx-1];\n"
 "		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
-"\n"
 "		if( iData.m_key != jData.m_key )\n"
 "		{\n"
 "			u32 k = iData.m_key;\n"
@@ -91,7 +72,6 @@ static const char* boundSearchKernelsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "__kernel\n"
 "void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
@@ -99,11 +79,9 @@ static const char* boundSearchKernelsCL= \
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	\n"
-"\n"
 "	if( gIdx < nDst )\n"
 "	{\n"
 "		C[gIdx] = A[gIdx] - B[gIdx];\n"
 "	}\n"
 "}\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
@@ -2,23 +2,18 @@
 static const char* fillKernelsCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
-"\n"
 "typedef unsigned int u32;\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
@@ -28,11 +23,9 @@ static const char* fillKernelsCL= \
 "#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
 "#define AtomInc(x) atom_inc(&(x))\n"
 "#define AtomInc1(x, out) out = atom_inc(&(x))\n"
-"\n"
 "#define make_uint4 (uint4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	union\n"
@@ -45,66 +38,54 @@ static const char* fillKernelsCL= \
 "	int m_n;\n"
 "	int m_padding[2];\n"
 "} ConstBuffer;\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < num_elements )\n"
 "	{\n"
 "		dstInt[ offset+gIdx ] = value;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < num_elements )\n"
 "	{\n"
 "		dstFloat[ offset+gIdx ] = value;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < num )\n"
 "	{\n"
 "		dstInt[ offset+gIdx ] = value;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < num )\n"
 "	{\n"
 "		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(64,1,1)))\n"
 "void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < num )\n"
 "	{\n"
 "		dstInt4[ offset+gIdx ] = value;\n"
 "	}\n"
 "}\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
@@ -2,33 +2,27 @@
 static const char* prefixScanKernelsCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "typedef unsigned int u32;\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
 "#define GET_GROUP_SIZE get_local_size(0)\n"
 "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"\n"
 "// takahiro end\n"
 "#define WG_SIZE 128 \n"
 "#define m_numElems x\n"
 "#define m_numBlocks y\n"
 "#define m_numScanBlocks z\n"
-"\n"
 "/*typedef struct\n"
 "{\n"
 "	uint m_numElems;\n"
@@ -37,7 +31,6 @@ static const char* prefixScanKernelsCL= \
 "	uint m_padding[1];\n"
 "} ConstBuffer;\n"
 "*/\n"
-"\n"
 "u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
 "{\n"
 "	u32 blocksum;\n"
@@ -52,17 +45,13 @@ static const char* prefixScanKernelsCL= \
 "            data[bi] += data[ai];\n"
 "        }\n"
 "	}\n"
-"\n"
 "    GROUP_LDS_BARRIER;\n"
-"\n"
 "    if( lIdx == 0 )\n"
 "	{\n"
 "		blocksum = data[ n-1 ];\n"
 "        data[ n-1 ] = 0;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	offset >>= 1;\n"
 "    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
 "    {\n"
@@ -77,27 +66,20 @@ static const char* prefixScanKernelsCL= \
 "        }\n"
 "	}\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	return blocksum;\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "__kernel\n"
 "void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
 "		uint4 cb)\n"
 "{\n"
 "	__local u32 ldsData[WG_SIZE*2];\n"
-"\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
-"\n"
 "	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
 "	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
-"\n"
 "	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"\n"
 "	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
-"\n"
 "	if( (2*gIdx) < cb.m_numElems )\n"
 "    {\n"
 "        dst[2*gIdx]     = ldsData[2*lIdx];\n"
@@ -107,25 +89,20 @@ static const char* prefixScanKernelsCL= \
 "        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
 "    }\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "__kernel\n"
 "void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
 "{\n"
 "	const u32 blockSize = WG_SIZE*2;\n"
-"\n"
 "	int myIdx = GET_GROUP_IDX+1;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
-"\n"
 "	u32 iBlockSum = blockSum[myIdx];\n"
-"\n"
 "	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
 "	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
 "	{\n"
 "		dst[i] += iBlockSum;\n"
 "	}\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "__kernel\n"
 "void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
@@ -134,21 +111,16 @@ static const char* prefixScanKernelsCL= \
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	int lSize = GET_GROUP_SIZE;\n"
-"\n"
 "	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
 "	{\n"
 "		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"\n"
 "	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
 "	{\n"
 "		dst[i] = ldsData[i];\n"
 "	}\n"
-"\n"
 "	if( gIdx == 0 )\n"
 "	{\n"
 "		dst[cb.m_numBlocks] = sum;\n"
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
@@ -2,33 +2,27 @@
 static const char* prefixScanKernelsFloat4CL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "typedef unsigned int u32;\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
 "#define GET_GROUP_SIZE get_local_size(0)\n"
 "#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"\n"
 "// takahiro end\n"
 "#define WG_SIZE 128 \n"
 "#define m_numElems x\n"
 "#define m_numBlocks y\n"
 "#define m_numScanBlocks z\n"
-"\n"
 "/*typedef struct\n"
 "{\n"
 "	uint m_numElems;\n"
@@ -37,7 +31,6 @@ static const char* prefixScanKernelsFloat4CL= \
 "	uint m_padding[1];\n"
 "} ConstBuffer;\n"
 "*/\n"
-"\n"
 "float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
 "{\n"
 "	float4 blocksum;\n"
@@ -52,17 +45,13 @@ static const char* prefixScanKernelsFloat4CL= \
 "            data[bi] += data[ai];\n"
 "        }\n"
 "	}\n"
-"\n"
 "    GROUP_LDS_BARRIER;\n"
-"\n"
 "    if( lIdx == 0 )\n"
 "	{\n"
 "		blocksum = data[ n-1 ];\n"
 "    data[ n-1 ] = 0;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	offset >>= 1;\n"
 "    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
 "    {\n"
@@ -77,27 +66,20 @@ static const char* prefixScanKernelsFloat4CL= \
 "        }\n"
 "	}\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	return blocksum;\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "__kernel\n"
 "void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb)\n"
 "{\n"
 "	__local float4 ldsData[WG_SIZE*2];\n"
-"\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
-"\n"
 "	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
 "	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
-"\n"
 "	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"\n"
 "	if( lIdx == 0 ) \n"
 "		sumBuffer[GET_GROUP_IDX] = sum;\n"
-"\n"
 "	if( (2*gIdx) < cb.m_numElems )\n"
 "    {\n"
 "        dst[2*gIdx]     = ldsData[2*lIdx];\n"
@@ -107,25 +89,20 @@ static const char* prefixScanKernelsFloat4CL= \
 "        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
 "    }\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "__kernel\n"
 "void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
 "{\n"
 "	const u32 blockSize = WG_SIZE*2;\n"
-"\n"
 "	int myIdx = GET_GROUP_IDX+1;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
-"\n"
 "	float4 iBlockSum = blockSum[myIdx];\n"
-"\n"
 "	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
 "	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
 "	{\n"
 "		dst[i] += iBlockSum;\n"
 "	}\n"
 "}\n"
-"\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "__kernel\n"
 "void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
@@ -134,21 +111,16 @@ static const char* prefixScanKernelsFloat4CL= \
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	int lSize = GET_GROUP_SIZE;\n"
-"\n"
 "	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
 "	{\n"
 "		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"\n"
 "	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
 "	{\n"
 "		dst[i] = ldsData[i];\n"
 "	}\n"
-"\n"
 "	if( gIdx == 0 )\n"
 "	{\n"
 "		dst[cb.m_numBlocks] = sum;\n"
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
@@ -3,24 +3,19 @@ static const char* radixSort32KernelsCL= \
 "/*\n"
 "Bullet Continuous Collision Detection and Physics Library\n"
 "Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org\n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Author Takahiro Harada\n"
-"\n"
-"\n"
 "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
-"\n"
 "typedef unsigned int u32;\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
@@ -31,38 +26,27 @@ static const char* radixSort32KernelsCL= \
 "#define AtomInc(x) atom_inc(&(x))\n"
 "#define AtomInc1(x, out) out = atom_inc(&(x))\n"
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
-"\n"
 "#define make_uint4 (uint4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
 "#define WG_SIZE 64\n"
 "#define ELEMENTS_PER_WORK_ITEM (256/WG_SIZE)\n"
 "#define BITS_PER_PASS 4\n"
 "#define NUM_BUCKET (1<<BITS_PER_PASS)\n"
 "typedef uchar u8;\n"
-"\n"
 "//	this isn't optimization for VLIW. But just reducing writes. \n"
 "#define USE_2LEVEL_REDUCE 1\n"
-"\n"
 "//#define CHECK_BOUNDARY 1\n"
-"\n"
 "//#define NV_GPU 1\n"
-"\n"
-"\n"
 "//	Cypress\n"
 "#define nPerWI 16\n"
 "//	Cayman\n"
 "//#define nPerWI 20\n"
-"\n"
 "#define m_n x\n"
 "#define m_nWGs y\n"
 "#define m_startBit z\n"
 "#define m_nBlocksPerWG w\n"
-"\n"
 "/*\n"
 "typedef struct\n"
 "{\n"
@@ -72,14 +56,11 @@ static const char* radixSort32KernelsCL= \
 "	int m_nBlocksPerWG;\n"
 "} ConstBuffer;\n"
 "*/\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	unsigned int m_key;\n"
 "	unsigned int m_value;\n"
 "} SortDataCL;\n"
-"\n"
-"\n"
 "uint prefixScanVectorEx( uint4* data )\n"
 "{\n"
 "	u32 sum = 0;\n"
@@ -97,16 +78,13 @@ static const char* radixSort32KernelsCL= \
 "	sum += tmp;\n"
 "	return sum;\n"
 "}\n"
-"\n"
 "u32 localPrefixSum( u32 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory, int wgSize /*64 or 128*/ )\n"
 "{\n"
 "	{	//	Set data\n"
 "		sorterSharedMemory[lIdx] = 0;\n"
 "		sorterSharedMemory[lIdx+wgSize] = pData;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	{	//	Prefix sum\n"
 "		int idx = 2*lIdx + (wgSize+1);\n"
 "#if defined(USE_2LEVEL_REDUCE)\n"
@@ -118,13 +96,11 @@ static const char* radixSort32KernelsCL= \
 "			u2 = sorterSharedMemory[idx-1];\n"
 "			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
 "			GROUP_MEM_FENCE;\n"
-"\n"
 "			u0 = sorterSharedMemory[idx-12];\n"
 "			u1 = sorterSharedMemory[idx-8];\n"
 "			u2 = sorterSharedMemory[idx-4];\n"
 "			AtomAdd( sorterSharedMemory[idx], u0+u1+u2 );			\n"
 "			GROUP_MEM_FENCE;\n"
-"\n"
 "			u0 = sorterSharedMemory[idx-48];\n"
 "			u1 = sorterSharedMemory[idx-32];\n"
 "			u2 = sorterSharedMemory[idx-16];\n"
@@ -135,7 +111,6 @@ static const char* radixSort32KernelsCL= \
 "				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
 "				GROUP_MEM_FENCE;\n"
 "			}\n"
-"\n"
 "			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
 "			GROUP_MEM_FENCE;\n"
 "		}\n"
@@ -159,20 +134,16 @@ static const char* radixSort32KernelsCL= \
 "				sorterSharedMemory[idx] += sorterSharedMemory[idx-64];\n"
 "				GROUP_MEM_FENCE;\n"
 "			}\n"
-"\n"
 "			sorterSharedMemory[idx-1] += sorterSharedMemory[idx-2];\n"
 "			GROUP_MEM_FENCE;\n"
 "		}\n"
 "#endif\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	*totalSum = sorterSharedMemory[wgSize*2-1];\n"
 "	u32 addValue = sorterSharedMemory[lIdx+wgSize-1];\n"
 "	return addValue;\n"
 "}\n"
-"\n"
 "//__attribute__((reqd_work_group_size(128,1,1)))\n"
 "uint4 localPrefixSum128V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
 "{\n"
@@ -180,8 +151,6 @@ static const char* radixSort32KernelsCL= \
 "	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 128 );\n"
 "	return pData + make_uint4( rank, rank, rank, rank );\n"
 "}\n"
-"\n"
-"\n"
 "//__attribute__((reqd_work_group_size(64,1,1)))\n"
 "uint4 localPrefixSum64V( uint4 pData, uint lIdx, uint* totalSum, __local u32* sorterSharedMemory )\n"
 "{\n"
@@ -189,28 +158,18 @@ static const char* radixSort32KernelsCL= \
 "	u32 rank = localPrefixSum( s4, lIdx, totalSum, sorterSharedMemory, 64 );\n"
 "	return pData + make_uint4( rank, rank, rank, rank );\n"
 "}\n"
-"\n"
 "u32 unpack4Key( u32 key, int keyIdx ){ return (key>>(keyIdx*8)) & 0xff;}\n"
-"\n"
 "u32 bit8Scan(u32 v)\n"
 "{\n"
 "	return (v<<8) + (v<<16) + (v<<24);\n"
 "}\n"
-"\n"
 "//===\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define MY_HISTOGRAM(idx) localHistogramMat[(idx)*WG_SIZE+lIdx]\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void StreamCountKernel( __global u32* gSrc, __global u32* histogramOut, int4 cb )\n"
 "{\n"
 "	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
-"\n"
 "	u32 gIdx = GET_GLOBAL_IDX;\n"
 "	u32 lIdx = GET_LOCAL_IDX;\n"
 "	u32 wgIdx = GET_GROUP_IDX;\n"
@@ -219,21 +178,15 @@ static const char* radixSort32KernelsCL= \
 "	const int n = cb.m_n;\n"
 "	const int nWGs = cb.m_nWGs;\n"
 "	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"\n"
 "	for(int i=0; i<NUM_BUCKET; i++)\n"
 "	{\n"
 "		MY_HISTOGRAM(i) = 0;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
 "	u32 localKey;\n"
-"\n"
 "	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"\n"
 "	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"\n"
 "	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
 "	{\n"
 "		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
@@ -254,7 +207,6 @@ static const char* radixSort32KernelsCL= \
 "			}\n"
 "		}\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
 "	\n"
 "	if( lIdx < NUM_BUCKET )\n"
@@ -267,13 +219,11 @@ static const char* radixSort32KernelsCL= \
 "		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void StreamCountSortDataKernel( __global SortDataCL* gSrc, __global u32* histogramOut, int4  cb )\n"
 "{\n"
 "	__local u32 localHistogramMat[NUM_BUCKET*WG_SIZE];\n"
-"\n"
 "	u32 gIdx = GET_GLOBAL_IDX;\n"
 "	u32 lIdx = GET_LOCAL_IDX;\n"
 "	u32 wgIdx = GET_GROUP_IDX;\n"
@@ -282,21 +232,15 @@ static const char* radixSort32KernelsCL= \
 "	const int n = cb.m_n;\n"
 "	const int nWGs = cb.m_nWGs;\n"
 "	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"\n"
 "	for(int i=0; i<NUM_BUCKET; i++)\n"
 "	{\n"
 "		MY_HISTOGRAM(i) = 0;\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
 "	u32 localKey;\n"
-"\n"
 "	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"\n"
 "	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"\n"
 "	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
 "	{\n"
 "		//	MY_HISTOGRAM( localKeys.x ) ++ is much expensive than atomic add as it requires read and write while atomics can just add on AMD\n"
@@ -317,7 +261,6 @@ static const char* radixSort32KernelsCL= \
 "			}\n"
 "		}\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
 "	\n"
 "	if( lIdx < NUM_BUCKET )\n"
@@ -330,20 +273,16 @@ static const char* radixSort32KernelsCL= \
 "		histogramOut[lIdx*nWGs+wgIdx] = sum;\n"
 "	}\n"
 "}\n"
-"\n"
 "#define nPerLane (nPerWI/4)\n"
-"\n"
 "//	NUM_BUCKET*nWGs < 128*nPerWI\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(128,1,1)))\n"
 "void PrefixScanKernel( __global u32* wHistogram1, int4  cb )\n"
 "{\n"
 "	__local u32 ldsTopScanData[128*2];\n"
-"\n"
 "	u32 lIdx = GET_LOCAL_IDX;\n"
 "	u32 wgIdx = GET_GROUP_IDX;\n"
 "	const int nWGs = cb.m_nWGs;\n"
-"\n"
 "	u32 data[nPerWI];\n"
 "	for(int i=0; i<nPerWI; i++)\n"
 "	{\n"
@@ -351,9 +290,7 @@ static const char* radixSort32KernelsCL= \
 "		if( (nPerWI*lIdx+i) < NUM_BUCKET*nWGs )\n"
 "			data[i] = wHistogram1[nPerWI*lIdx+i];\n"
 "	}\n"
-"\n"
 "	uint4 myData = make_uint4(0,0,0,0);\n"
-"\n"
 "	for(int i=0; i<nPerLane; i++)\n"
 "	{\n"
 "		myData.x += data[nPerLane*0+i];\n"
@@ -361,10 +298,8 @@ static const char* radixSort32KernelsCL= \
 "		myData.z += data[nPerLane*2+i];\n"
 "		myData.w += data[nPerLane*3+i];\n"
 "	}\n"
-"\n"
 "	uint totalSum;\n"
 "	uint4 scanned = localPrefixSum128V( myData, lIdx, &totalSum, ldsTopScanData );\n"
-"\n"
 "//	for(int j=0; j<4; j++) //	somehow it introduces a lot of branches\n"
 "	{	int j = 0;\n"
 "		u32 sum = 0;\n"
@@ -402,7 +337,6 @@ static const char* radixSort32KernelsCL= \
 "			sum += tmp;\n"
 "		}\n"
 "	}\n"
-"\n"
 "	for(int i=0; i<nPerLane; i++)\n"
 "	{\n"
 "		data[nPerLane*0+i] += scanned.x;\n"
@@ -410,7 +344,6 @@ static const char* radixSort32KernelsCL= \
 "		data[nPerLane*2+i] += scanned.z;\n"
 "		data[nPerLane*3+i] += scanned.w;\n"
 "	}\n"
-"\n"
 "	for(int i=0; i<nPerWI; i++)\n"
 "	{\n"
 "		int index = nPerWI*lIdx+i;\n"
@@ -418,7 +351,6 @@ static const char* radixSort32KernelsCL= \
 "			wHistogram1[nPerWI*lIdx+i] = data[i];\n"
 "	}\n"
 "}\n"
-"\n"
 "//	4 scan, 4 exchange\n"
 "void sort4Bits(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
 "{\n"
@@ -433,26 +365,20 @@ static const char* radixSort32KernelsCL= \
 "			uint4 localAddr = make_uint4(lIdx*4+0,lIdx*4+1,lIdx*4+2,lIdx*4+3);\n"
 "			uint4 dstAddr = localAddr - prefixSum + make_uint4( total, total, total, total );\n"
 "			dstAddr = SELECT_UINT4( prefixSum, dstAddr, cmpResult != make_uint4(0, 0, 0, 0) );\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "			ldsSortData[dstAddr.x] = sortData[0];\n"
 "			ldsSortData[dstAddr.y] = sortData[1];\n"
 "			ldsSortData[dstAddr.z] = sortData[2];\n"
 "			ldsSortData[dstAddr.w] = sortData[3];\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "			sortData[0] = ldsSortData[localAddr.x];\n"
 "			sortData[1] = ldsSortData[localAddr.y];\n"
 "			sortData[2] = ldsSortData[localAddr.z];\n"
 "			sortData[3] = ldsSortData[localAddr.w];\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "//	2 scan, 2 exchange\n"
 "void sort4Bits1(u32 sortData[4], int startBit, int lIdx, __local u32* ldsSortData)\n"
 "{\n"
@@ -462,7 +388,6 @@ static const char* radixSort32KernelsCL= \
 "			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
 "			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
 "			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
-"\n"
 "		u32 key4;\n"
 "		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
 "		{\n"
@@ -470,22 +395,17 @@ static const char* radixSort32KernelsCL= \
 "			sKeyPacked[1] |= 1<<(8*b.y);\n"
 "			sKeyPacked[2] |= 1<<(8*b.z);\n"
 "			sKeyPacked[3] |= 1<<(8*b.w);\n"
-"\n"
 "			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
 "		}\n"
-"\n"
 "		u32 rankPacked;\n"
 "		u32 sumPacked;\n"
 "		{\n"
 "			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
 "		}\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		u32 newOffset[4] = { 0,0,0,0 };\n"
 "		{\n"
 "			u32 sumScanned = bit8Scan( sumPacked );\n"
-"\n"
 "			u32 scannedKeys[4];\n"
 "			scannedKeys[0] = 1<<(8*b.x);\n"
 "			scannedKeys[1] = 1<<(8*b.y);\n"
@@ -500,7 +420,6 @@ static const char* radixSort32KernelsCL= \
 "					sum4 += tmp;\n"
 "				}\n"
 "			}\n"
-"\n"
 "			{\n"
 "				u32 sumPlusRank = sumScanned + rankPacked;\n"
 "				{	u32 ie = b.x;\n"
@@ -521,31 +440,23 @@ static const char* radixSort32KernelsCL= \
 "				}\n"
 "			}\n"
 "		}\n"
-"\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		{\n"
 "			ldsSortData[newOffset[0]] = sortData[0];\n"
 "			ldsSortData[newOffset[1]] = sortData[1];\n"
 "			ldsSortData[newOffset[2]] = sortData[2];\n"
 "			ldsSortData[newOffset[3]] = sortData[3];\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "			u32 dstAddr = 4*lIdx;\n"
 "			sortData[0] = ldsSortData[dstAddr+0];\n"
 "			sortData[1] = ldsSortData[dstAddr+1];\n"
 "			sortData[2] = ldsSortData[dstAddr+2];\n"
 "			sortData[3] = ldsSortData[dstAddr+3];\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "#define SET_HISTOGRAM(setIdx, key) ldsSortData[(setIdx)*NUM_BUCKET+key]\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void SortAndScatterKernel( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
@@ -553,34 +464,25 @@ static const char* radixSort32KernelsCL= \
 "	__local u32 ldsSortData[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
 "	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
 "	__local u32 localHistogram[NUM_BUCKET*2];\n"
-"\n"
 "	u32 gIdx = GET_GLOBAL_IDX;\n"
 "	u32 lIdx = GET_LOCAL_IDX;\n"
 "	u32 wgIdx = GET_GROUP_IDX;\n"
 "	u32 wgSize = GET_GROUP_SIZE;\n"
-"\n"
 "	const int n = cb.m_n;\n"
 "	const int nWGs = cb.m_nWGs;\n"
 "	const int startBit = cb.m_startBit;\n"
 "	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"\n"
 "	if( lIdx < (NUM_BUCKET) )\n"
 "	{\n"
 "		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"\n"
 "	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
-"\n"
 "	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"\n"
 "	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
 "	{\n"
 "		u32 myHistogram = 0;\n"
-"\n"
 "		u32 sortData[ELEMENTS_PER_WORK_ITEM];\n"
 "		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
 "#if defined(CHECK_BOUNDARY)\n"
@@ -588,13 +490,10 @@ static const char* radixSort32KernelsCL= \
 "#else\n"
 "			sortData[i] = gSrc[ addr+i ];\n"
 "#endif\n"
-"\n"
 "		sort4Bits(sortData, startBit, lIdx, ldsSortData);\n"
-"\n"
 "		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
 "		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
 "			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
-"\n"
 "		{	//	create histogram\n"
 "			u32 setIdx = lIdx/16;\n"
 "			if( lIdx < NUM_BUCKET )\n"
@@ -603,12 +502,10 @@ static const char* radixSort32KernelsCL= \
 "			}\n"
 "			ldsSortData[lIdx] = 0;\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
 "#if defined(CHECK_BOUNDARY)\n"
 "				if( addr+i < n )\n"
 "#endif\n"
-"\n"
 "#if defined(NV_GPU)\n"
 "				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
 "#else\n"
@@ -629,13 +526,11 @@ static const char* radixSort32KernelsCL= \
 "				localHistogram[hIdx] = sum;\n"
 "			}\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "#if defined(USE_2LEVEL_REDUCE)\n"
 "			if( lIdx < NUM_BUCKET )\n"
 "			{\n"
 "				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
 "				GROUP_MEM_FENCE;\n"
-"\n"
 "				u32 u0, u1, u2;\n"
 "				u0 = localHistogram[hIdx-3];\n"
 "				u1 = localHistogram[hIdx-2];\n"
@@ -665,7 +560,6 @@ static const char* radixSort32KernelsCL= \
 "#endif\n"
 "			GROUP_LDS_BARRIER;\n"
 "		}\n"
-"\n"
 "		{\n"
 "			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
 "			{\n"
@@ -679,9 +573,7 @@ static const char* radixSort32KernelsCL= \
 "				gDst[ groupOffset + myIdx ] = sortData[ie];\n"
 "			}\n"
 "		}\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		if( lIdx < NUM_BUCKET )\n"
 "		{\n"
 "			localHistogramToCarry[lIdx] += myHistogram;\n"
@@ -689,7 +581,6 @@ static const char* radixSort32KernelsCL= \
 "		GROUP_LDS_BARRIER;\n"
 "	}\n"
 "}\n"
-"\n"
 "//	2 scan, 2 exchange\n"
 "void sort4Bits1KeyValue(u32 sortData[4], int sortVal[4], int startBit, int lIdx, __local u32* ldsSortData, __local int *ldsSortVal)\n"
 "{\n"
@@ -699,7 +590,6 @@ static const char* radixSort32KernelsCL= \
 "			(sortData[1]>>(startBit+ibit)) & 0x3, \n"
 "			(sortData[2]>>(startBit+ibit)) & 0x3, \n"
 "			(sortData[3]>>(startBit+ibit)) & 0x3);\n"
-"\n"
 "		u32 key4;\n"
 "		u32 sKeyPacked[4] = { 0, 0, 0, 0 };\n"
 "		{\n"
@@ -707,22 +597,17 @@ static const char* radixSort32KernelsCL= \
 "			sKeyPacked[1] |= 1<<(8*b.y);\n"
 "			sKeyPacked[2] |= 1<<(8*b.z);\n"
 "			sKeyPacked[3] |= 1<<(8*b.w);\n"
-"\n"
 "			key4 = sKeyPacked[0] + sKeyPacked[1] + sKeyPacked[2] + sKeyPacked[3];\n"
 "		}\n"
-"\n"
 "		u32 rankPacked;\n"
 "		u32 sumPacked;\n"
 "		{\n"
 "			rankPacked = localPrefixSum( key4, lIdx, &sumPacked, ldsSortData, WG_SIZE );\n"
 "		}\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		u32 newOffset[4] = { 0,0,0,0 };\n"
 "		{\n"
 "			u32 sumScanned = bit8Scan( sumPacked );\n"
-"\n"
 "			u32 scannedKeys[4];\n"
 "			scannedKeys[0] = 1<<(8*b.x);\n"
 "			scannedKeys[1] = 1<<(8*b.y);\n"
@@ -737,7 +622,6 @@ static const char* radixSort32KernelsCL= \
 "					sum4 += tmp;\n"
 "				}\n"
 "			}\n"
-"\n"
 "			{\n"
 "				u32 sumPlusRank = sumScanned + rankPacked;\n"
 "				{	u32 ie = b.x;\n"
@@ -758,42 +642,30 @@ static const char* radixSort32KernelsCL= \
 "				}\n"
 "			}\n"
 "		}\n"
-"\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		{\n"
 "			ldsSortData[newOffset[0]] = sortData[0];\n"
 "			ldsSortData[newOffset[1]] = sortData[1];\n"
 "			ldsSortData[newOffset[2]] = sortData[2];\n"
 "			ldsSortData[newOffset[3]] = sortData[3];\n"
-"\n"
 "			ldsSortVal[newOffset[0]] = sortVal[0];\n"
 "			ldsSortVal[newOffset[1]] = sortVal[1];\n"
 "			ldsSortVal[newOffset[2]] = sortVal[2];\n"
 "			ldsSortVal[newOffset[3]] = sortVal[3];\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "			u32 dstAddr = 4*lIdx;\n"
 "			sortData[0] = ldsSortData[dstAddr+0];\n"
 "			sortData[1] = ldsSortData[dstAddr+1];\n"
 "			sortData[2] = ldsSortData[dstAddr+2];\n"
 "			sortData[3] = ldsSortData[dstAddr+3];\n"
-"\n"
 "			sortVal[0] = ldsSortVal[dstAddr+0];\n"
 "			sortVal[1] = ldsSortVal[dstAddr+1];\n"
 "			sortVal[2] = ldsSortVal[dstAddr+2];\n"
 "			sortVal[3] = ldsSortVal[dstAddr+3];\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void SortAndScatterSortDataKernel( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
@@ -802,39 +674,28 @@ static const char* radixSort32KernelsCL= \
 "	__local int ldsSortVal[WG_SIZE*ELEMENTS_PER_WORK_ITEM+16];\n"
 "	__local u32 localHistogramToCarry[NUM_BUCKET];\n"
 "	__local u32 localHistogram[NUM_BUCKET*2];\n"
-"\n"
 "	u32 gIdx = GET_GLOBAL_IDX;\n"
 "	u32 lIdx = GET_LOCAL_IDX;\n"
 "	u32 wgIdx = GET_GROUP_IDX;\n"
 "	u32 wgSize = GET_GROUP_SIZE;\n"
-"\n"
 "	const int n = cb.m_n;\n"
 "	const int nWGs = cb.m_nWGs;\n"
 "	const int startBit = cb.m_startBit;\n"
 "	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"\n"
 "	if( lIdx < (NUM_BUCKET) )\n"
 "	{\n"
 "		localHistogramToCarry[lIdx] = rHistogram[lIdx*nWGs + wgIdx];\n"
 "	}\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
 "    \n"
-"\n"
 "	const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
-"\n"
 "	int nBlocks = n/blockSize - nBlocksPerWG*wgIdx;\n"
-"\n"
 "	int addr = blockSize*nBlocksPerWG*wgIdx + ELEMENTS_PER_WORK_ITEM*lIdx;\n"
-"\n"
 "	for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++, addr+=blockSize)\n"
 "	{\n"
-"\n"
 "		u32 myHistogram = 0;\n"
-"\n"
 "		int sortData[ELEMENTS_PER_WORK_ITEM];\n"
 "		int sortVal[ELEMENTS_PER_WORK_ITEM];\n"
-"\n"
 "		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
 "#if defined(CHECK_BOUNDARY)\n"
 "		{\n"
@@ -847,13 +708,10 @@ static const char* radixSort32KernelsCL= \
 "			sortVal[i] = gSrc[ addr+i ].m_value;\n"
 "		}\n"
 "#endif\n"
-"\n"
 "		sort4Bits1KeyValue(sortData, sortVal, startBit, lIdx, ldsSortData, ldsSortVal);\n"
-"\n"
 "		u32 keys[ELEMENTS_PER_WORK_ITEM];\n"
 "		for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
 "			keys[i] = (sortData[i]>>startBit) & 0xf;\n"
-"\n"
 "		{	//	create histogram\n"
 "			u32 setIdx = lIdx/16;\n"
 "			if( lIdx < NUM_BUCKET )\n"
@@ -862,12 +720,10 @@ static const char* radixSort32KernelsCL= \
 "			}\n"
 "			ldsSortData[lIdx] = 0;\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "			for(int i=0; i<ELEMENTS_PER_WORK_ITEM; i++)\n"
 "#if defined(CHECK_BOUNDARY)\n"
 "				if( addr+i < n )\n"
 "#endif\n"
-"\n"
 "#if defined(NV_GPU)\n"
 "				SET_HISTOGRAM( setIdx, keys[i] )++;\n"
 "#else\n"
@@ -888,13 +744,11 @@ static const char* radixSort32KernelsCL= \
 "				localHistogram[hIdx] = sum;\n"
 "			}\n"
 "			GROUP_LDS_BARRIER;\n"
-"\n"
 "#if defined(USE_2LEVEL_REDUCE)\n"
 "			if( lIdx < NUM_BUCKET )\n"
 "			{\n"
 "				localHistogram[hIdx] = localHistogram[hIdx-1];\n"
 "				GROUP_MEM_FENCE;\n"
-"\n"
 "				u32 u0, u1, u2;\n"
 "				u0 = localHistogram[hIdx-3];\n"
 "				u1 = localHistogram[hIdx-2];\n"
@@ -924,7 +778,6 @@ static const char* radixSort32KernelsCL= \
 "#endif\n"
 "			GROUP_LDS_BARRIER;\n"
 "		}\n"
-"\n"
 "    	{\n"
 "			for(int ie=0; ie<ELEMENTS_PER_WORK_ITEM; ie++)\n"
 "			{\n"
@@ -958,9 +811,7 @@ static const char* radixSort32KernelsCL= \
 "#endif\n"
 "			}\n"
 "		}\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		if( lIdx < NUM_BUCKET )\n"
 "		{\n"
 "			localHistogramToCarry[lIdx] += myHistogram;\n"
@@ -968,13 +819,6 @@ static const char* radixSort32KernelsCL= \
 "		GROUP_LDS_BARRIER;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void SortAndScatterSortDataKernelSerial( __global const SortDataCL* restrict gSrc, __global const u32* rHistogram, __global SortDataCL* restrict gDst, int4 cb)\n"
@@ -988,7 +832,6 @@ static const char* radixSort32KernelsCL= \
 "	const int n = cb.m_n;\n"
 "	const int nWGs = cb.m_nWGs;\n"
 "	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"\n"
 "    int counter[NUM_BUCKET];\n"
 "    \n"
 "    if (realLocalIdx>0)\n"
@@ -996,11 +839,9 @@ static const char* radixSort32KernelsCL= \
 "    \n"
 "    for (int c=0;c<NUM_BUCKET;c++)\n"
 "        counter[c]=0;\n"
-"\n"
 "    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
 "	\n"
 "	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"\n"
 "   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
 "  {\n"
 "     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
@@ -1022,8 +863,6 @@ static const char* radixSort32KernelsCL= \
 "  }\n"
 "    \n"
 "}\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void SortAndScatterKernelSerial( __global const u32* restrict gSrc, __global const u32* rHistogram, __global u32* restrict gDst, int4  cb )\n"
@@ -1037,7 +876,6 @@ static const char* radixSort32KernelsCL= \
 "	const int n = cb.m_n;\n"
 "	const int nWGs = cb.m_nWGs;\n"
 "	const int nBlocksPerWG = cb.m_nBlocksPerWG;\n"
-"\n"
 "    int counter[NUM_BUCKET];\n"
 "    \n"
 "    if (realLocalIdx>0)\n"
@@ -1045,11 +883,9 @@ static const char* radixSort32KernelsCL= \
 "    \n"
 "    for (int c=0;c<NUM_BUCKET;c++)\n"
 "        counter[c]=0;\n"
-"\n"
 "    const int blockSize = ELEMENTS_PER_WORK_ITEM*WG_SIZE;\n"
 "	\n"
 "	int nBlocks = (n)/blockSize - nBlocksPerWG*wgIdx;\n"
-"\n"
 "   for(int iblock=0; iblock<min(nBlocksPerWG, nBlocks); iblock++)\n"
 "  {\n"
 "     for (int lIdx=0;lIdx<WG_SIZE;lIdx++)\n"
--- a/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h
+++ b/src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.h
@@ -5,14 +5,11 @@ static const char* rayCastKernelCL= \
 "#define SHAPE_CONCAVE_TRIMESH 5\n"
 "#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
 "#define SHAPE_SPHERE 7\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_from;\n"
 "	float4 m_to;\n"
 "} b3RayInfo;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float m_hitFraction;\n"
@@ -22,20 +19,17 @@ static const char* rayCastKernelCL= \
 "	float4	m_hitPoint;\n"
 "	float4	m_hitNormal;\n"
 "} b3RayHit;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	unsigned int m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct Collidable\n"
 "{\n"
 "	union {\n"
@@ -46,53 +40,37 @@ static const char* rayCastKernelCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "} Collidable;\n"
-"\n"
-"\n"
 "typedef struct  \n"
 "{\n"
 "	float4		m_localCenter;\n"
 "	float4		m_extents;\n"
 "	float4		mC;\n"
 "	float4		mE;\n"
-"\n"
 "	float			m_radius;\n"
 "	int	m_faceOffset;\n"
 "	int m_numFaces;\n"
 "	int	m_numVertices;\n"
-"\n"
 "	int m_vertexOffset;\n"
 "	int	m_uniqueEdgesOffset;\n"
 "	int	m_numUniqueEdges;\n"
 "	int m_unused;\n"
-"\n"
 "} ConvexPolyhedronCL;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_plane;\n"
 "	int m_indexOffset;\n"
 "	int m_numIndices;\n"
 "} b3GpuFace;\n"
-"\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "	Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "	Quaternion qtNormalize(Quaternion in);\n"
-"\n"
-"\n"
 "__inline\n"
 "	Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
 "__inline\n"
 "	float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -100,8 +78,6 @@ static const char* rayCastKernelCL= \
 "	float4 b1 = (float4)(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "	Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -112,7 +88,6 @@ static const char* rayCastKernelCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "	Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -130,39 +105,28 @@ static const char* rayCastKernelCL= \
 "	out = qtMul(out,qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "	Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "	float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void	trInverse(float4 translationIn, Quaternion orientationIn,\n"
 "	float4* translationOut, Quaternion* orientationOut)\n"
 "{\n"
 "	*orientationOut = qtInvert(orientationIn);\n"
 "	*translationOut = qtRotate(*orientationOut, -translationIn);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
 "	__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
 "{\n"
 "	rayFromLocal.w = 0.f;\n"
 "	rayToLocal.w = 0.f;\n"
 "	bool result = true;\n"
-"\n"
 "	float exitFraction = hitFraction[0];\n"
 "	float enterFraction = -0.3f;\n"
 "	float4 curHitNormal = (float4)(0,0,0,0);\n"
@@ -200,12 +164,10 @@ static const char* rayCastKernelCL= \
 "		if (exitFraction <= enterFraction)\n"
 "			result = false;\n"
 "	}\n"
-"\n"
 "	if (enterFraction < 0.f)\n"
 "	{\n"
 "		result = false;\n"
 "	}\n"
-"\n"
 "	if (result)\n"
 "	{	\n"
 "		hitFraction[0] = enterFraction;\n"
@@ -213,12 +175,6 @@ static const char* rayCastKernelCL= \
 "	}\n"
 "	return result;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "bool sphere_intersect(float4 spherePos,  float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
 "{\n"
 "	float4 rs = rayFrom - spherePos;\n"
@@ -228,13 +184,10 @@ static const char* rayCastKernelCL= \
 "	float A = dot(rayDir,rayDir);\n"
 "	float B = dot(rs, rayDir);\n"
 "	float C = dot(rs, rs) - (radius * radius);\n"
-"\n"
 "	float D = B * B - A*C;\n"
-"\n"
 "	if (D > 0.0f)\n"
 "	{\n"
 "		float t = (-B - sqrt(D))/A;\n"
-"\n"
 "		if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
 "		{\n"
 "			*hitFraction = t;\n"
@@ -243,7 +196,6 @@ static const char* rayCastKernelCL= \
 "	}\n"
 "	return false;\n"
 "}\n"
-"\n"
 "float4 setInterpolate3(float4 from, float4 to, float t)\n"
 "{\n"
 "	float s = 1.0f - t;\n"
@@ -252,7 +204,6 @@ static const char* rayCastKernelCL= \
 "	result.w = 0.f;	\n"
 "	return result;	\n"
 "}\n"
-"\n"
 "__kernel void rayCastKernel(  \n"
 "	int numRays, \n"
 "	const __global b3RayInfo* rays, \n"
@@ -263,23 +214,18 @@ static const char* rayCastKernelCL= \
 "	__global const b3GpuFace* faces,\n"
 "	__global const ConvexPolyhedronCL* convexShapes	)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numRays)\n"
 "		return;\n"
-"\n"
 "	hitResults[i].m_hitFraction = 1.f;\n"
-"\n"
 "	float4 rayFrom = rays[i].m_from;\n"
 "	float4 rayTo = rays[i].m_to;\n"
 "	float hitFraction = 1.f;\n"
 "	float4 hitPoint;\n"
 "	float4 hitNormal;\n"
 "	int hitBodyIndex= -1;\n"
-"\n"
 "	int cachedCollidableIndex = -1;\n"
 "	Collidable cachedCollidable;\n"
-"\n"
 "	for (int b=0;b<numBodies;b++)\n"
 "	{\n"
 "		if (hitResults[i].m_hitResult2==b)\n"
@@ -294,7 +240,6 @@ static const char* rayCastKernelCL= \
 "		}\n"
 "		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
 "		{\n"
-"\n"
 "			float4 invPos = (float4)(0,0,0,0);\n"
 "			float4 invOrn = (float4)(0,0,0,0);\n"
 "			float4 rayFromLocal = (float4)(0,0,0,0);\n"
@@ -327,7 +272,6 @@ static const char* rayCastKernelCL= \
 "			}\n"
 "		}\n"
 "	}\n"
-"\n"
 "	if (hitBodyIndex>=0)\n"
 "	{\n"
 "		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
@@ -336,6 +280,5 @@ static const char* rayCastKernelCL= \
 "		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
 "		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
 "	}\n"
-"\n"
 "}\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuBatchingPgsSolver.cpp
@@ -148,8 +148,8 @@ b3GpuBatchingPgsSolver::b3GpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
 		cl_program solveFrictionProg= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solveFrictionSource, &pErrNum,additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
 		b3Assert(solveFrictionProg);

-		//cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
-		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH,true);
+		cl_program solverSetup2Prog= b3OpenCLUtils::compileCLProgramFromString( ctx, device, solverSetup2Source, &pErrNum,additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
+		
 		
 		b3Assert(solverSetup2Prog);

@@ -886,7 +886,7 @@ void b3GpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
 					if (b3GpuBatchContacts)
 					{
 						B3_PROFILE("gpu batchContacts");
-						maxNumBatches = 50;//250;
+						maxNumBatches = 250;//250;
 						m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
 					} else
 					{
--- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
@@ -33,7 +33,7 @@ subject to the following restrictions:
 #define B3_RIGIDBODY_INTEGRATE_PATH "src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.cl"
 #define B3_RIGIDBODY_UPDATEAABB_PATH "src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl"

-bool useDbvt = false;
+bool useDbvt = false;//true;
 bool useBullet2CpuSolver = true;
 bool dumpContactStats = false;

--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl
@@ -13,6 +13,7 @@ subject to the following restrictions:
 */
 //Originally written by Takahiro Harada

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -64,22 +65,7 @@ typedef unsigned char u8;



-typedef struct 
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyA;//sign bit set for fixed objects
-	int m_bodyB;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-}Contact4;

 typedef struct 
 {
@@ -133,7 +119,7 @@ u32 tryWrite(__local u32* buff, int idx)
 }

 //	batching on the GPU
-__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,
+__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,
 		__global const u32* gN, __global const u32* gStart, 
 		int m_staticIdx )
 {
@@ -186,8 +172,8 @@ __kernel void CreateBatches( __global const Contact4* gConstraints, __global Con
 							int dstIdx;
 							AtomInc1( ldsRingEnd, dstIdx );
 							
-							int a = gConstraints[m_start+srcIdx].m_bodyA;
-							int b = gConstraints[m_start+srcIdx].m_bodyB;
+							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;
+							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;
 							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
 							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
 							ldsRingElem[dstIdx].m_idx = srcIdx;
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
@@ -2,37 +2,71 @@
 static const char* batchingKernelsCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile __global int*\n"
 "#endif\n"
-"\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,43 +80,16 @@ static const char* batchingKernelsCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
-"\n"
-"\n"
-"typedef struct \n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyA;//sign bit set for fixed objects\n"
-"	int m_bodyB;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"}Contact4;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_n;\n"
@@ -90,24 +97,19 @@ static const char* batchingKernelsCL= \
 "	int m_staticIdx;\n"
 "	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_a;\n"
 "	int m_b;\n"
 "	u32 m_idx;\n"
 "}Elem;\n"
-"\n"
 "#define STACK_SIZE (WG_SIZE*10)\n"
 "//#define STACK_SIZE (WG_SIZE)\n"
 "#define RING_SIZE 1024\n"
 "#define RING_SIZE_MASK (RING_SIZE-1)\n"
 "#define CHECK_SIZE (WG_SIZE)\n"
-"\n"
-"\n"
 "#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
 "#define RING_END ldsTmp\n"
-"\n"
 "u32 readBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -115,7 +117,6 @@ static const char* batchingKernelsCL= \
 "	int bufIdx = idx/32;\n"
 "	return buff[bufIdx] & (1<<bitIdx);\n"
 "}\n"
-"\n"
 "void writeBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -124,7 +125,6 @@ static const char* batchingKernelsCL= \
 "//	buff[bufIdx] |= (1<<bitIdx);\n"
 "	atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "}\n"
-"\n"
 "u32 tryWrite(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -133,9 +133,8 @@ static const char* batchingKernelsCL= \
 "	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "	return ((ans >> bitIdx)&1) == 0;\n"
 "}\n"
-"\n"
 "//	batching on the GPU\n"
-"__kernel void CreateBatches( __global const Contact4* gConstraints, __global Contact4* gConstraintsOut,\n"
+"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
 "		__global const u32* gN, __global const u32* gStart, \n"
 "		int m_staticIdx )\n"
 "{\n"
@@ -148,7 +147,6 @@ static const char* batchingKernelsCL= \
 "	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
 "	__local u32 ldsGEnd;\n"
 "	__local u32 ldsDstEnd;\n"
-"\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	\n"
@@ -168,7 +166,6 @@ static const char* batchingKernelsCL= \
 "	for(int ie=0; ie<50; ie++)\n"
 "	{\n"
 "		ldsFixedBuffer[lIdx] = 0;\n"
-"\n"
 "		for(int giter=0; giter<4; giter++)\n"
 "		{\n"
 "			int ringCap = GET_RING_CAPACITY;\n"
@@ -188,8 +185,8 @@ static const char* batchingKernelsCL= \
 "							int dstIdx;\n"
 "							AtomInc1( ldsRingEnd, dstIdx );\n"
 "							\n"
-"							int a = gConstraints[m_start+srcIdx].m_bodyA;\n"
-"							int b = gConstraints[m_start+srcIdx].m_bodyB;\n"
+"							int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
+"							int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
 "							ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
 "							ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
 "							ldsRingElem[dstIdx].m_idx = srcIdx;\n"
@@ -198,37 +195,31 @@ static const char* batchingKernelsCL= \
 "					ringCap = GET_RING_CAPACITY;\n"
 "				}\n"
 "			}\n"
-"\n"
 "			GROUP_LDS_BARRIER;\n"
 "	\n"
 "			//	2. fill stack\n"
 "			__local Elem* dst = ldsRingElem;\n"
 "			if( lIdx == 0 ) RING_END = 0;\n"
-"\n"
 "			int srcIdx=lIdx;\n"
 "			int end = ldsRingEnd;\n"
-"\n"
 "			{\n"
 "				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
 "				{\n"
 "					Elem e;\n"
 "					if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
 "					bool done = (srcIdx<end)?false:true;\n"
-"\n"
 "					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
 "					\n"
 "					if( !done )\n"
 "					{\n"
 "						int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
 "						int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
-"\n"
 "						if( aUsed==0 && bUsed==0 )\n"
 "						{\n"
 "							int aAvailable=1;\n"
 "							int bAvailable=1;\n"
 "							int ea = abs(e.m_a);\n"
 "							int eb = abs(e.m_b);\n"
-"\n"
 "							bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
 "							bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
 "							\n"
@@ -239,7 +230,6 @@ static const char* batchingKernelsCL= \
 "							\n"
 "							//aAvailable = aStatic? 1: aAvailable;\n"
 "							//bAvailable = bStatic? 1: bAvailable;\n"
-"\n"
 "							bool success = (aAvailable && bAvailable);\n"
 "							if(success)\n"
 "							{\n"
@@ -252,7 +242,6 @@ static const char* batchingKernelsCL= \
 "							done = success;\n"
 "						}\n"
 "					}\n"
-"\n"
 "					//	put it aside\n"
 "					if(srcIdx<end)\n"
 "					{\n"
@@ -272,7 +261,6 @@ static const char* batchingKernelsCL= \
 "							dst[dstIdx] = e;\n"
 "						}\n"
 "					}\n"
-"\n"
 "					//	if filled, flush\n"
 "					if( ldsStackEnd == STACK_SIZE )\n"
 "					{\n"
@@ -284,18 +272,14 @@ static const char* batchingKernelsCL= \
 "							gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
 "						}\n"
 "						if( lIdx == 0 ) ldsStackEnd = 0;\n"
-"\n"
 "						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
 "						ldsFixedBuffer[lIdx] = 0;\n"
 "					}\n"
 "				}\n"
 "			}\n"
-"\n"
 "			if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
 "		}\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
 "		{\n"
 "			int idx = m_start + ldsStackIdx[i];\n"
@@ -303,7 +287,6 @@ static const char* batchingKernelsCL= \
 "			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
 "			gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
 "		}\n"
-"\n"
 "		//	in case it couldn't consume any pair. Flush them\n"
 "		//	todo. Serial batch worth while?\n"
 "		if( ldsStackEnd == 0 )\n"
@@ -318,38 +301,11 @@ static const char* batchingKernelsCL= \
 "			GROUP_LDS_BARRIER;\n"
 "			if( lIdx == 0 ) ldsRingEnd = 0;\n"
 "		}\n"
-"\n"
 "		if( lIdx == 0 ) ldsStackEnd = 0;\n"
-"\n"
 "		GROUP_LDS_BARRIER;\n"
-"\n"
 "		//	termination\n"
 "		if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
 "			break;\n"
 "	}\n"
-"\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl
@@ -13,6 +13,7 @@ subject to the following restrictions:
 */
 //Originally written by Erwin Coumans

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -65,22 +66,7 @@ typedef unsigned char u8;



-typedef struct 
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;//sign bit set for fixed objects
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-}Contact4;

 typedef struct 
 {
@@ -102,7 +88,7 @@ typedef struct


 //	batching on the GPU
-__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )
+__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )
 {
 	int wgIdx = GET_GROUP_IDX;
 	int lIdx = GET_LOCAL_IDX;
@@ -155,13 +141,13 @@ u32 tryWrite(__local u32* buff, int idx)


 //	batching on the GPU
-__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
+__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
 {
 	int wgIdx = GET_GROUP_IDX;
 	int lIdx = GET_LOCAL_IDX;
 	const int numConstraints = gN[wgIdx];
 	const int m_start = gStart[wgIdx];
-		
+	b3Contact4Data_t tmp;
 	
 	__local u32 ldsFixedBuffer[CHECK_SIZE];
 		
@@ -173,7 +159,7 @@ __kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const
 	{
 	
 		
-		__global Contact4* cs = &gConstraints[m_start];	
+		__global struct b3Contact4Data* cs = &gConstraints[m_start];	
 	
 		
 		int numValidConstraints = 0;
@@ -214,11 +200,51 @@ __kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const

 					if (i!=numValidConstraints)
 					{
-						//btSwap(cs[i],cs[numValidConstraints]);

-						Contact4 tmp = cs[i];
-						cs[i] = cs[numValidConstraints];
-						cs[numValidConstraints] = tmp;
+//						tmp = cs[i];
+//						cs[i] = cs[numValidConstraints];
+//						cs[numValidConstraints]  = tmp;
+
+#ifdef CHECK_SIZE
+						tmp.m_worldPos[0] = cs[i].m_worldPos[0];
+						tmp.m_worldPos[1] = cs[i].m_worldPos[1];
+						tmp.m_worldPos[2] = cs[i].m_worldPos[2];
+						tmp.m_worldPos[3] = cs[i].m_worldPos[3];
+						tmp.m_worldNormal = cs[i].m_worldNormal;
+						tmp.m_restituitionCoeffCmp = cs[i].m_restituitionCoeffCmp;
+						tmp.m_frictionCoeffCmp = cs[i].m_frictionCoeffCmp;
+						tmp.m_batchIdx = cs[i].m_batchIdx;
+						tmp.m_bodyAPtrAndSignBit = cs[i].m_bodyAPtrAndSignBit;
+						tmp.m_bodyBPtrAndSignBit = cs[i].m_bodyBPtrAndSignBit;
+						tmp.m_childIndexA = cs[i].m_childIndexA;
+						tmp.m_childIndexB = cs[i].m_childIndexB;
+
+						cs[i].m_worldPos[0] = cs[numValidConstraints].m_worldPos[0];
+						cs[i].m_worldPos[1] = cs[numValidConstraints].m_worldPos[1];
+						cs[i].m_worldPos[2] = cs[numValidConstraints].m_worldPos[2];
+						cs[i].m_worldPos[3] = cs[numValidConstraints].m_worldPos[3];
+						cs[i].m_worldNormal = cs[numValidConstraints].m_worldNormal;
+						cs[i].m_restituitionCoeffCmp = cs[numValidConstraints].m_restituitionCoeffCmp;
+						cs[i].m_frictionCoeffCmp = cs[numValidConstraints].m_frictionCoeffCmp;
+						cs[i].m_batchIdx = cs[numValidConstraints].m_batchIdx;
+						cs[i].m_bodyAPtrAndSignBit = cs[numValidConstraints].m_bodyAPtrAndSignBit;
+						cs[i].m_bodyBPtrAndSignBit = cs[numValidConstraints].m_bodyBPtrAndSignBit;
+						cs[i].m_childIndexA = cs[numValidConstraints].m_childIndexA;
+						cs[i].m_childIndexB = cs[numValidConstraints].m_childIndexB;
+
+						cs[numValidConstraints].m_worldPos[0] = tmp.m_worldPos[0];
+						cs[numValidConstraints].m_worldPos[1] = tmp.m_worldPos[1];
+						cs[numValidConstraints].m_worldPos[2] = tmp.m_worldPos[2];
+						cs[numValidConstraints].m_worldPos[3] = tmp.m_worldPos[3];
+						cs[numValidConstraints].m_worldNormal = tmp.m_worldNormal;
+						cs[numValidConstraints].m_restituitionCoeffCmp = tmp.m_restituitionCoeffCmp;
+						cs[numValidConstraints].m_frictionCoeffCmp = tmp.m_frictionCoeffCmp;
+						cs[numValidConstraints].m_batchIdx = tmp.m_batchIdx;
+						cs[numValidConstraints].m_bodyAPtrAndSignBit = tmp.m_bodyAPtrAndSignBit;
+						cs[numValidConstraints].m_bodyBPtrAndSignBit = tmp.m_bodyBPtrAndSignBit;
+						cs[numValidConstraints].m_childIndexA = tmp.m_childIndexA;
+						cs[numValidConstraints].m_childIndexB = tmp.m_childIndexB;
+#endif

 					}

--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
@@ -2,38 +2,72 @@
 static const char* batchingKernelsNewCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile __global int*\n"
 "#endif\n"
-"\n"
 "#define SIMD_WIDTH 64\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -47,43 +81,16 @@ static const char* batchingKernelsNewCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
-"\n"
-"\n"
-"typedef struct \n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;//sign bit set for fixed objects\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"}Contact4;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_n;\n"
@@ -91,20 +98,14 @@ static const char* batchingKernelsNewCL= \
 "	int m_staticIdx;\n"
 "	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_a;\n"
 "	int m_b;\n"
 "	u32 m_idx;\n"
 "}Elem;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "//	batching on the GPU\n"
-"__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
+"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, 	__global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
 "{\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
@@ -122,13 +123,7 @@ static const char* batchingKernelsNewCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "#define CHECK_SIZE (WG_SIZE)\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "u32 readBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -136,7 +131,6 @@ static const char* batchingKernelsNewCL= \
 "	int bufIdx = idx/32;\n"
 "	return buff[bufIdx] & (1<<bitIdx);\n"
 "}\n"
-"\n"
 "void writeBuf(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -145,7 +139,6 @@ static const char* batchingKernelsNewCL= \
 "	buff[bufIdx] |= (1<<bitIdx);\n"
 "	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "}\n"
-"\n"
 "u32 tryWrite(__local u32* buff, int idx)\n"
 "{\n"
 "	idx = idx % (32*CHECK_SIZE);\n"
@@ -154,16 +147,14 @@ static const char* batchingKernelsNewCL= \
 "	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
 "	return ((ans >> bitIdx)&1) == 0;\n"
 "}\n"
-"\n"
-"\n"
 "//	batching on the GPU\n"
-"__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
+"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
 "{\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	const int numConstraints = gN[wgIdx];\n"
 "	const int m_start = gStart[wgIdx];\n"
-"		\n"
+"	b3Contact4Data_t tmp;\n"
 "	\n"
 "	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
 "		\n"
@@ -175,12 +166,11 @@ static const char* batchingKernelsNewCL= \
 "	{\n"
 "	\n"
 "		\n"
-"		__global Contact4* cs = &gConstraints[m_start];	\n"
+"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
 "	\n"
 "		\n"
 "		int numValidConstraints = 0;\n"
 "		int batchIdx = 0;\n"
-"\n"
 "		while( numValidConstraints < numConstraints)\n"
 "		{\n"
 "			int nCurrentBatch = 0;\n"
@@ -188,10 +178,8 @@ static const char* batchingKernelsNewCL= \
 "	\n"
 "			for(int i=0; i<CHECK_SIZE; i++) \n"
 "				ldsFixedBuffer[i] = 0;		\n"
-"\n"
 "			for(int i=numValidConstraints; i<numConstraints; i++)\n"
 "			{\n"
-"\n"
 "				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
 "				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
 "				int bodyA = abs(bodyAS);\n"
@@ -211,19 +199,51 @@ static const char* batchingKernelsNewCL= \
 "					{\n"
 "						writeBuf( ldsFixedBuffer, bodyB );\n"
 "					}\n"
-"\n"
 "					cs[i].m_batchIdx = batchIdx;\n"
-"\n"
 "					if (i!=numValidConstraints)\n"
 "					{\n"
-"						//btSwap(cs[i],cs[numValidConstraints]);\n"
-"						\n"
-"						Contact4 tmp = cs[i];\n"
-"						cs[i] = cs[numValidConstraints];\n"
-"						cs[numValidConstraints] = tmp;\n"
-"						\n"
+"//						tmp = cs[i];\n"
+"//						cs[i] = cs[numValidConstraints];\n"
+"//						cs[numValidConstraints]  = tmp;\n"
+"#ifdef CHECK_SIZE\n"
+"						tmp.m_worldPos[0] = cs[i].m_worldPos[0];\n"
+"						tmp.m_worldPos[1] = cs[i].m_worldPos[1];\n"
+"						tmp.m_worldPos[2] = cs[i].m_worldPos[2];\n"
+"						tmp.m_worldPos[3] = cs[i].m_worldPos[3];\n"
+"						tmp.m_worldNormal = cs[i].m_worldNormal;\n"
+"						tmp.m_restituitionCoeffCmp = cs[i].m_restituitionCoeffCmp;\n"
+"						tmp.m_frictionCoeffCmp = cs[i].m_frictionCoeffCmp;\n"
+"						tmp.m_batchIdx = cs[i].m_batchIdx;\n"
+"						tmp.m_bodyAPtrAndSignBit = cs[i].m_bodyAPtrAndSignBit;\n"
+"						tmp.m_bodyBPtrAndSignBit = cs[i].m_bodyBPtrAndSignBit;\n"
+"						tmp.m_childIndexA = cs[i].m_childIndexA;\n"
+"						tmp.m_childIndexB = cs[i].m_childIndexB;\n"
+"						cs[i].m_worldPos[0] = cs[numValidConstraints].m_worldPos[0];\n"
+"						cs[i].m_worldPos[1] = cs[numValidConstraints].m_worldPos[1];\n"
+"						cs[i].m_worldPos[2] = cs[numValidConstraints].m_worldPos[2];\n"
+"						cs[i].m_worldPos[3] = cs[numValidConstraints].m_worldPos[3];\n"
+"						cs[i].m_worldNormal = cs[numValidConstraints].m_worldNormal;\n"
+"						cs[i].m_restituitionCoeffCmp = cs[numValidConstraints].m_restituitionCoeffCmp;\n"
+"						cs[i].m_frictionCoeffCmp = cs[numValidConstraints].m_frictionCoeffCmp;\n"
+"						cs[i].m_batchIdx = cs[numValidConstraints].m_batchIdx;\n"
+"						cs[i].m_bodyAPtrAndSignBit = cs[numValidConstraints].m_bodyAPtrAndSignBit;\n"
+"						cs[i].m_bodyBPtrAndSignBit = cs[numValidConstraints].m_bodyBPtrAndSignBit;\n"
+"						cs[i].m_childIndexA = cs[numValidConstraints].m_childIndexA;\n"
+"						cs[i].m_childIndexB = cs[numValidConstraints].m_childIndexB;\n"
+"						cs[numValidConstraints].m_worldPos[0] = tmp.m_worldPos[0];\n"
+"						cs[numValidConstraints].m_worldPos[1] = tmp.m_worldPos[1];\n"
+"						cs[numValidConstraints].m_worldPos[2] = tmp.m_worldPos[2];\n"
+"						cs[numValidConstraints].m_worldPos[3] = tmp.m_worldPos[3];\n"
+"						cs[numValidConstraints].m_worldNormal = tmp.m_worldNormal;\n"
+"						cs[numValidConstraints].m_restituitionCoeffCmp = tmp.m_restituitionCoeffCmp;\n"
+"						cs[numValidConstraints].m_frictionCoeffCmp = tmp.m_frictionCoeffCmp;\n"
+"						cs[numValidConstraints].m_batchIdx = tmp.m_batchIdx;\n"
+"						cs[numValidConstraints].m_bodyAPtrAndSignBit = tmp.m_bodyAPtrAndSignBit;\n"
+"						cs[numValidConstraints].m_bodyBPtrAndSignBit = tmp.m_bodyBPtrAndSignBit;\n"
+"						cs[numValidConstraints].m_childIndexA = tmp.m_childIndexA;\n"
+"						cs[numValidConstraints].m_childIndexB = tmp.m_childIndexB;\n"
+"#endif\n"
 "					}\n"
-"\n"
 "					numValidConstraints++;\n"
 "					\n"
 "					nCurrentBatch++;\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
@@ -2,19 +2,16 @@
 static const char* integrateKernelCL= \
 "/*\n"
 "Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
 "float4 quatMult(float4 q1, float4 q2)\n"
 "{\n"
 "	float4 q;\n"
@@ -24,7 +21,6 @@ static const char* integrateKernelCL= \
 "	q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; \n"
 "	return q;\n"
 "}\n"
-"\n"
 "float4 quatNorm(float4 q)\n"
 "{\n"
 "	float len = native_sqrt(dot(q, q));\n"
@@ -39,24 +35,17 @@ static const char* integrateKernelCL= \
 "	}\n"
 "	return q;\n"
 "}\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	unsigned int m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel void \n"
 "  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
 "{\n"
@@ -92,12 +81,10 @@ static const char* integrateKernelCL= \
 "			float4 dorn = axis;\n"
 "			dorn.w = native_cos(fAngle * timeStep * 0.5f);\n"
 "			float4 orn0 = bodies[nodeID].m_quat;\n"
-"\n"
 "			float4 predictedOrn = quatMult(dorn, orn0);\n"
 "			predictedOrn = quatNorm(predictedOrn);\n"
 "			bodies[nodeID].m_quat=predictedOrn;\n"
 "		}\n"
-"\n"
 "		//linear velocity		\n"
 "		bodies[nodeID].m_pos +=  bodies[nodeID].m_linVel * timeStep;\n"
 "		\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/jointSolver.h
@@ -2,56 +2,37 @@
 static const char* solveConstraintRowsCL= \
 "/*\n"
 "Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
 "#define B3_CONSTRAINT_FLAG_ENABLED 1\n"
-"\n"
 "#define B3_GPU_POINT2POINT_CONSTRAINT_TYPE 3\n"
 "#define B3_GPU_FIXED_CONSTRAINT_TYPE 4\n"
-"\n"
 "#define MOTIONCLAMP 100000 //unused, for debugging/safety in case constraint solver fails\n"
 "#define B3_INFINITY 1e30f\n"
-"\n"
 "#define mymake_float4 (float4)\n"
-"\n"
-"\n"
 "__inline float dot3F4(float4 a, float4 b)\n"
 "{\n"
 "	float4 a1 = mymake_float4(a.xyz,0.f);\n"
 "	float4 b1 = mymake_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -62,36 +43,28 @@ static const char* solveConstraintRowsCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertiaWorld;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} BodyInertia;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_basis;//orientation\n"
 "	float4	m_origin;//transform\n"
 "}b3Transform;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "//	b3Transform		m_worldTransformUnused;\n"
@@ -104,38 +77,30 @@ static const char* solveConstraintRowsCL= \
 "	float4		m_turnVelocity;\n"
 "	float4		m_linearVelocity;\n"
 "	float4		m_angularVelocity;\n"
-"\n"
 "	union \n"
 "	{\n"
 "		void*	m_originalBody;\n"
 "		int		m_originalBodyIndex;\n"
 "	};\n"
 "	int padding[3];\n"
-"\n"
 "} b3GpuSolverBody;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	unsigned int m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} b3RigidBodyCL;\n"
-"\n"
 "typedef struct\n"
 "{\n"
-"\n"
 "	float4		m_relpos1CrossNormal;\n"
 "	float4		m_contactNormal;\n"
-"\n"
 "	float4		m_relpos2CrossNormal;\n"
 "	//float4		m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal\n"
-"\n"
 "	float4		m_angularComponentA;\n"
 "	float4		m_angularComponentB;\n"
 "	\n"
@@ -152,15 +117,11 @@ static const char* solveConstraintRowsCL= \
 "	float		m_upperLimit;\n"
 "	float		m_rhsPenetration;\n"
 "	int			m_originalConstraint;\n"
-"\n"
-"\n"
 "	int	m_overrideNumSolverIterations;\n"
 "    int			m_frictionIndex;\n"
 "	int m_solverBodyIdA;\n"
 "	int m_solverBodyIdB;\n"
-"\n"
 "} b3SolverConstraint;\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_bodyAPtrAndSignBit;\n"
@@ -168,28 +129,18 @@ static const char* solveConstraintRowsCL= \
 "	int m_originalConstraintIndex;\n"
 "	int m_batchId;\n"
 "} b3BatchConstraint;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int				m_constraintType;\n"
 "	int				m_rbA;\n"
 "	int				m_rbB;\n"
 "	float			m_breakingImpulseThreshold;\n"
-"\n"
 "	float4 m_pivotInA;\n"
 "	float4 m_pivotInB;\n"
 "	Quaternion m_relTargetAB;\n"
-"\n"
 "	int	m_flags;\n"
 "	int m_padding[3];\n"
 "} b3GpuGenericConstraint;\n"
-"\n"
-"\n"
 "/*b3Transform	getWorldTransform(b3RigidBodyCL* rb)\n"
 "{\n"
 "	b3Transform newTrans;\n"
@@ -197,39 +148,25 @@ static const char* solveConstraintRowsCL= \
 "	newTrans.setRotation(rb->m_quat);\n"
 "	return newTrans;\n"
 "}*/\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	v = mymake_float4(v.xyz,0.f);\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -240,7 +177,6 @@ static const char* solveConstraintRowsCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -257,30 +193,23 @@ static const char* solveConstraintRowsCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
-"\n"
 "__inline void internalApplyImpulse(__global b3GpuSolverBody* body,  float4 linearComponent, float4 angularComponent,float impulseMagnitude)\n"
 "{\n"
 "	body->m_deltaLinearVelocity += linearComponent*impulseMagnitude*body->m_linearFactor;\n"
 "	body->m_deltaAngularVelocity += angularComponent*(impulseMagnitude*body->m_angularFactor);\n"
 "}\n"
-"\n"
-"\n"
 "void resolveSingleConstraintRowGeneric(__global b3GpuSolverBody* body1, __global b3GpuSolverBody* body2, __global b3SolverConstraint* c)\n"
 "{\n"
 "	float deltaImpulse = c->m_rhs-c->m_appliedImpulse*c->m_cfm;\n"
 "	float deltaVel1Dotn	=	dot3F4(c->m_contactNormal,body1->m_deltaLinearVelocity) 	+ dot3F4(c->m_relpos1CrossNormal,body1->m_deltaAngularVelocity);\n"
 "	float deltaVel2Dotn	=	-dot3F4(c->m_contactNormal,body2->m_deltaLinearVelocity) + dot3F4(c->m_relpos2CrossNormal,body2->m_deltaAngularVelocity);\n"
-"\n"
 "	deltaImpulse	-=	deltaVel1Dotn*c->m_jacDiagABInv;\n"
 "	deltaImpulse	-=	deltaVel2Dotn*c->m_jacDiagABInv;\n"
-"\n"
 "	float sum = c->m_appliedImpulse + deltaImpulse;\n"
 "	if (sum < c->m_lowerLimit)\n"
 "	{\n"
@@ -296,12 +225,9 @@ static const char* solveConstraintRowsCL= \
 "	{\n"
 "		c->m_appliedImpulse = sum;\n"
 "	}\n"
-"\n"
 "	internalApplyImpulse(body1,c->m_contactNormal*body1->m_invMass,c->m_angularComponentA,deltaImpulse);\n"
 "	internalApplyImpulse(body2,-c->m_contactNormal*body2->m_invMass,c->m_angularComponentB,deltaImpulse);\n"
-"\n"
 "}\n"
-"\n"
 "__kernel void solveJointConstraintRows(__global b3GpuSolverBody* solverBodies,\n"
 "					  __global b3BatchConstraint* batchConstraints,\n"
 "					  	__global b3SolverConstraint* rows,\n"
@@ -315,7 +241,6 @@ static const char* solveConstraintRowsCL= \
 "	int b = get_global_id(0);\n"
 "	if (b>=numConstraintsInBatch)\n"
 "		return;\n"
-"\n"
 "	__global b3BatchConstraint* c = &batchConstraints[b+batchOffset];\n"
 "	int originalConstraintIndex = c->m_originalConstraintIndex;\n"
 "	if (constraints[originalConstraintIndex].m_flags&B3_CONSTRAINT_FLAG_ENABLED)\n"
@@ -329,16 +254,13 @@ static const char* solveConstraintRowsCL= \
 "		}\n"
 "	}\n"
 "};\n"
-"\n"
 "__kernel void initSolverBodies(__global b3GpuSolverBody* solverBodies,__global b3RigidBodyCL* bodiesCL, int numBodies)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numBodies)\n"
 "		return;\n"
-"\n"
 "	__global b3GpuSolverBody* solverBody = &solverBodies[i];\n"
 "	__global b3RigidBodyCL* bodyCL = &bodiesCL[i];\n"
-"\n"
 "	solverBody->m_deltaLinearVelocity = (float4)(0.f,0.f,0.f,0.f);\n"
 "	solverBody->m_deltaAngularVelocity  = (float4)(0.f,0.f,0.f,0.f);\n"
 "	solverBody->m_pushVelocity = (float4)(0.f,0.f,0.f,0.f);\n"
@@ -350,7 +272,6 @@ static const char* solveConstraintRowsCL= \
 "	solverBody->m_linearVelocity = bodyCL->m_linVel;\n"
 "	solverBody->m_angularVelocity = bodyCL->m_angVel;\n"
 "}\n"
-"\n"
 "__kernel void breakViolatedConstraintsKernel(__global b3GpuGenericConstraint* constraints, __global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, __global b3SolverConstraint* rows, int numConstraints)\n"
 "{\n"
 "	int cid = get_global_id(0);\n"
@@ -370,17 +291,12 @@ static const char* solveConstraintRowsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__kernel void getInfo1Kernel(__global unsigned int* infos, __global b3GpuGenericConstraint* constraints, int numConstraints)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numConstraints)\n"
 "		return;\n"
-"\n"
 "	__global b3GpuGenericConstraint* constraint = &constraints[i];\n"
-"\n"
 "	switch (constraint->m_constraintType)\n"
 "	{\n"
 "		case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n"
@@ -398,7 +314,6 @@ static const char* solveConstraintRowsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel void initBatchConstraintsKernel(__global unsigned int* numConstraintRows, __global unsigned int* rowOffsets, \n"
 "										__global b3BatchConstraint* batchConstraints, \n"
 "										__global b3GpuGenericConstraint* constraints,\n"
@@ -408,26 +323,18 @@ static const char* solveConstraintRowsCL= \
 "	int i = get_global_id(0);\n"
 "	if (i>=numConstraints)\n"
 "		return;\n"
-"\n"
 "	int rbA = constraints[i].m_rbA;\n"
 "	int rbB = constraints[i].m_rbB;\n"
-"\n"
 "	batchConstraints[i].m_bodyAPtrAndSignBit = bodies[rbA].m_invMass? rbA : -rbA;\n"
 "	batchConstraints[i].m_bodyBPtrAndSignBit = bodies[rbB].m_invMass? rbB : -rbB;\n"
 "	batchConstraints[i].m_batchId = -1;\n"
 "	batchConstraints[i].m_originalConstraintIndex = i;\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	// integrator parameters: frames per second (1/stepsize), default error\n"
 "	// reduction parameter (0..1).\n"
 "	float fps,erp;\n"
-"\n"
 "	// for the first and second body, pointers to two (linear and angular)\n"
 "	// n*3 jacobian sub matrices, stored by rows. these matrices will have\n"
 "	// been initialized to 0 on entry. if the second body is zero then the\n"
@@ -441,7 +348,6 @@ static const char* solveConstraintRowsCL= \
 "	{\n"
 "		__global float4* m_J1angularAxisFloat4;\n"
 "		__global float* m_J1angularAxis;\n"
-"\n"
 "	};\n"
 "	union\n"
 "	{\n"
@@ -455,17 +361,14 @@ static const char* solveConstraintRowsCL= \
 "	};\n"
 "	// elements to jump from one row to the next in J's\n"
 "	int rowskip;\n"
-"\n"
 "	// right hand sides of the equation J*v = c + cfm * lambda. cfm is the\n"
 "	// \"constraint force mixing\" vector. c is set to zero on entry, cfm is\n"
 "	// set to a constant value (typically very small or zero) value on entry.\n"
 "	__global float* m_constraintError;\n"
 "	__global float* cfm;\n"
-"\n"
 "	// lo and hi limits for variables (set to -/+ infinity on entry).\n"
 "	__global float* m_lowerLimit;\n"
 "	__global float* m_upperLimit;\n"
-"\n"
 "	// findex vector for variables. see the LCP solver interface for a\n"
 "	// description of what this does. this is set to -1 on entry.\n"
 "	// note that the returned indexes are relative to the first index of\n"
@@ -473,39 +376,28 @@ static const char* solveConstraintRowsCL= \
 "	__global int *findex;\n"
 "	// number of solver iterations\n"
 "	int m_numIterations;\n"
-"\n"
 "	//damping of the velocity\n"
 "	float	m_damping;\n"
 "} b3GpuConstraintInfo2;\n"
-"\n"
-"\n"
 "void	getSkewSymmetricMatrix(float4 vecIn, __global float4* v0,__global float4* v1,__global float4* v2)\n"
 "{\n"
 "	*v0 = (float4)(0.		,-vecIn.z		,vecIn.y,0.f);\n"
 "	*v1 = (float4)(vecIn.z	,0.			,-vecIn.x,0.f);\n"
 "	*v2 = (float4)(-vecIn.y	,vecIn.x	,0.f,0.f);\n"
 "}\n"
-"\n"
-"\n"
 "void getInfo2Point2Point(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies)\n"
 "{\n"
 "	float4 posA = bodies[constraint->m_rbA].m_pos;\n"
 "	Quaternion rotA = bodies[constraint->m_rbA].m_quat;\n"
-"\n"
 "	float4 posB = bodies[constraint->m_rbB].m_pos;\n"
 "	Quaternion rotB = bodies[constraint->m_rbB].m_quat;\n"
-"\n"
-"\n"
-"\n"
 "		// anchor points in global coordinates with respect to body PORs.\n"
 "   \n"
 "    // set jacobian\n"
 "    info->m_J1linearAxis[0] = 1;\n"
 "	info->m_J1linearAxis[info->rowskip+1] = 1;\n"
 "	info->m_J1linearAxis[2*info->rowskip+2] = 1;\n"
-"\n"
 "	float4 a1 = qtRotate(rotA,constraint->m_pivotInA);\n"
-"\n"
 "	{\n"
 "		__global float4* angular0 = (__global float4*)(info->m_J1angularAxis);\n"
 "		__global float4* angular1 = (__global float4*)(info->m_J1angularAxis+info->rowskip);\n"
@@ -533,18 +425,15 @@ static const char* solveConstraintRowsCL= \
 "    // set right hand side\n"
 "//	float currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;\n"
 "	float currERP = info->erp;\n"
-"\n"
 "	float k = info->fps * currERP;\n"
 "    int j;\n"
 "	float4 result = a2 + posB - a1 - posA;\n"
 "	float* resultPtr = &result;\n"
-"\n"
 "	for (j=0; j<3; j++)\n"
 "    {\n"
 "        info->m_constraintError[j*info->rowskip] = k * (resultPtr[j]);\n"
 "    }\n"
 "}\n"
-"\n"
 "Quaternion nearest( Quaternion first, Quaternion qd)\n"
 "{\n"
 "	Quaternion diff,sum;\n"
@@ -555,7 +444,6 @@ static const char* solveConstraintRowsCL= \
 "		return qd;\n"
 "	return (-qd);\n"
 "}\n"
-"\n"
 "float b3Acos(float x) \n"
 "{ \n"
 "	if (x<-1)	\n"
@@ -564,7 +452,6 @@ static const char* solveConstraintRowsCL= \
 "		x=1;\n"
 "	return acos(x); \n"
 "}\n"
-"\n"
 "float getAngle(Quaternion orn)\n"
 "{\n"
 "	if (orn.w>=1.f)\n"
@@ -572,7 +459,6 @@ static const char* solveConstraintRowsCL= \
 "	float s = 2.f * b3Acos(orn.w);\n"
 "	return s;\n"
 "}\n"
-"\n"
 "void calculateDiffAxisAngleQuaternion( Quaternion orn0,Quaternion orn1a,float4* axis,float* angle)\n"
 "{\n"
 "	Quaternion orn1 = nearest(orn0,orn1a);\n"
@@ -588,17 +474,12 @@ static const char* solveConstraintRowsCL= \
 "	else\n"
 "		*axis /= sqrt(len);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void getInfo2FixedOrientation(__global b3GpuGenericConstraint* constraint,b3GpuConstraintInfo2* info,__global b3RigidBodyCL* bodies, int start_row)\n"
 "{\n"
 "	Quaternion worldOrnA = bodies[constraint->m_rbA].m_quat;\n"
 "	Quaternion worldOrnB = bodies[constraint->m_rbB].m_quat;\n"
-"\n"
 "	int s = info->rowskip;\n"
 "	int start_index = start_row * s;\n"
-"\n"
 "	// 3 rows to make body rotations equal\n"
 "	info->m_J1angularAxis[start_index] = 1;\n"
 "	info->m_J1angularAxis[start_index + s + 1] = 1;\n"
@@ -626,16 +507,12 @@ static const char* solveConstraintRowsCL= \
 "        info->m_constraintError[(3+j)*info->rowskip] = k * resultPtr[j];\n"
 "    }\n"
 "	\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void writeBackVelocitiesKernel(__global b3RigidBodyCL* bodies,__global b3GpuSolverBody* solverBodies,int numBodies)\n"
 "{\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numBodies)\n"
 "		return;\n"
-"\n"
 "	if (bodies[i].m_invMass)\n"
 "	{\n"
 "//		if (length(solverBodies[i].m_deltaLinearVelocity)<MOTIONCLAMP)\n"
@@ -648,8 +525,6 @@ static const char* solveConstraintRowsCL= \
 "		} \n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void getInfo2Kernel(__global b3SolverConstraint* solverConstraintRows, \n"
 "							__global unsigned int* infos, \n"
 "							__global unsigned int* constraintRowOffsets, \n"
@@ -665,7 +540,6 @@ static const char* solveConstraintRowsCL= \
 "							int globalNumIterations,\n"
 "							int numConstraints)\n"
 "{\n"
-"\n"
 "	int i = get_global_id(0);\n"
 "	if (i>=numConstraints)\n"
 "		return;\n"
@@ -675,17 +549,12 @@ static const char* solveConstraintRowsCL= \
 "			\n"
 "	__global b3SolverConstraint* currentConstraintRow = &solverConstraintRows[constraintRowOffsets[i]];\n"
 "	__global b3GpuGenericConstraint* constraint = &constraints[i];\n"
-"\n"
 "	__global b3RigidBodyCL* rbA = &bodies[ constraint->m_rbA];\n"
 "	__global b3RigidBodyCL* rbB = &bodies[ constraint->m_rbB];\n"
-"\n"
 "	int solverBodyIdA = constraint->m_rbA;\n"
 "	int solverBodyIdB = constraint->m_rbB;\n"
-"\n"
 "	__global b3GpuSolverBody* bodyAPtr = &solverBodies[solverBodyIdA];\n"
 "	__global b3GpuSolverBody* bodyBPtr = &solverBodies[solverBodyIdB];\n"
-"\n"
-"\n"
 "	if (rbA->m_invMass)\n"
 "	{\n"
 "		batchConstraints[i].m_bodyAPtrAndSignBit = solverBodyIdA;\n"
@@ -695,7 +564,6 @@ static const char* solveConstraintRowsCL= \
 "//				m_staticIdx = 0;\n"
 "		batchConstraints[i].m_bodyAPtrAndSignBit = -solverBodyIdA;\n"
 "	}\n"
-"\n"
 "	if (rbB->m_invMass)\n"
 "	{\n"
 "		batchConstraints[i].m_bodyBPtrAndSignBit = solverBodyIdB;\n"
@@ -705,14 +573,11 @@ static const char* solveConstraintRowsCL= \
 "//				m_staticIdx = 0;\n"
 "		batchConstraints[i].m_bodyBPtrAndSignBit = -solverBodyIdB;\n"
 "	}\n"
-"\n"
 "	if (info1)\n"
 "	{\n"
 "		int overrideNumSolverIterations = 0;//constraint->getOverrideNumSolverIterations() > 0 ? constraint->getOverrideNumSolverIterations() : infoGlobal.m_numIterations;\n"
 "//		if (overrideNumSolverIterations>m_maxOverrideNumSolverIterations)\n"
 "	//		m_maxOverrideNumSolverIterations = overrideNumSolverIterations;\n"
-"\n"
-"\n"
 "		int j;\n"
 "		for ( j=0;j<info1;j++)\n"
 "		{\n"
@@ -728,7 +593,6 @@ static const char* solveConstraintRowsCL= \
 "			currentConstraintRow[j].m_jacDiagABInv = 0.f;\n"
 "			currentConstraintRow[j].m_lowerLimit = 0.f;\n"
 "			currentConstraintRow[j].m_upperLimit = 0.f;\n"
-"\n"
 "			currentConstraintRow[j].m_originalConstraint = i;\n"
 "			currentConstraintRow[j].m_overrideNumSolverIterations = 0;\n"
 "			currentConstraintRow[j].m_relpos1CrossNormal = (float4)(0,0,0,0);\n"
@@ -746,7 +610,6 @@ static const char* solveConstraintRowsCL= \
 "			currentConstraintRow[j].m_solverBodyIdB = solverBodyIdB;\n"
 "			currentConstraintRow[j].m_overrideNumSolverIterations = overrideNumSolverIterations;		\n"
 "		}\n"
-"\n"
 "		bodyAPtr->m_deltaLinearVelocity = (float4)(0,0,0,0);\n"
 "		bodyAPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n"
 "		bodyAPtr->m_pushVelocity = (float4)(0,0,0,0);\n"
@@ -755,12 +618,8 @@ static const char* solveConstraintRowsCL= \
 "		bodyBPtr->m_deltaAngularVelocity = (float4)(0,0,0,0);\n"
 "		bodyBPtr->m_pushVelocity = (float4)(0,0,0,0);\n"
 "		bodyBPtr->m_turnVelocity  = (float4)(0,0,0,0);\n"
-"\n"
 "		int rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n"
 "		\n"
-"		\n"
-"\n"
-"\n"
 "		b3GpuConstraintInfo2 info2;\n"
 "		info2.fps = 1.f/timeStep;\n"
 "		info2.erp = globalErp;\n"
@@ -769,7 +628,6 @@ static const char* solveConstraintRowsCL= \
 "		info2.m_J2linearAxisFloat4 = 0;\n"
 "		info2.m_J2angularAxisFloat4 = &currentConstraintRow->m_relpos2CrossNormal;\n"
 "		info2.rowskip = sizeof(b3SolverConstraint)/sizeof(float);//check this\n"
-"\n"
 "		///the size of b3SolverConstraint needs be a multiple of float\n"
 "//		b3Assert(info2.rowskip*sizeof(float)== sizeof(b3SolverConstraint));\n"
 "		info2.m_constraintError = &currentConstraintRow->m_rhs;\n"
@@ -779,7 +637,6 @@ static const char* solveConstraintRowsCL= \
 "		info2.m_lowerLimit = &currentConstraintRow->m_lowerLimit;\n"
 "		info2.m_upperLimit = &currentConstraintRow->m_upperLimit;\n"
 "		info2.m_numIterations = globalNumIterations;\n"
-"\n"
 "		switch (constraint->m_constraintType)\n"
 "		{\n"
 "			case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:\n"
@@ -790,37 +647,29 @@ static const char* solveConstraintRowsCL= \
 "			case B3_GPU_FIXED_CONSTRAINT_TYPE:\n"
 "			{\n"
 "				getInfo2Point2Point(constraint,&info2,bodies);\n"
-"\n"
 "				getInfo2FixedOrientation(constraint,&info2,bodies,3);\n"
-"\n"
 "				break;\n"
 "			}\n"
-"\n"
 "			default:\n"
 "			{\n"
 "			}\n"
 "		}\n"
-"\n"
 "		///finalize the constraint setup\n"
 "		for ( j=0;j<info1;j++)\n"
 "		{\n"
 "			__global b3SolverConstraint* solverConstraint = &currentConstraintRow[j];\n"
-"\n"
 "			if (solverConstraint->m_upperLimit>=constraint->m_breakingImpulseThreshold)\n"
 "			{\n"
 "				solverConstraint->m_upperLimit = constraint->m_breakingImpulseThreshold;\n"
 "			}\n"
-"\n"
 "			if (solverConstraint->m_lowerLimit<=-constraint->m_breakingImpulseThreshold)\n"
 "			{\n"
 "				solverConstraint->m_lowerLimit = -constraint->m_breakingImpulseThreshold;\n"
 "			}\n"
-"\n"
 "//						solverConstraint->m_originalContactPoint = constraint;\n"
 "							\n"
 "			Matrix3x3 invInertiaWorldA= inertias[constraint->m_rbA].m_invInertiaWorld;\n"
 "			{\n"
-"\n"
 "				//float4 angularFactorA(1,1,1);\n"
 "				float4 ftorqueAxis1 = solverConstraint->m_relpos1CrossNormal;\n"
 "				solverConstraint->m_angularComponentA = mtMul1(invInertiaWorldA,ftorqueAxis1);//*angularFactorA;\n"
@@ -828,11 +677,9 @@ static const char* solveConstraintRowsCL= \
 "						\n"
 "			Matrix3x3 invInertiaWorldB= inertias[constraint->m_rbB].m_invInertiaWorld;\n"
 "			{\n"
-"\n"
 "				float4 ftorqueAxis2 = solverConstraint->m_relpos2CrossNormal;\n"
 "				solverConstraint->m_angularComponentB = mtMul1(invInertiaWorldB,ftorqueAxis2);//*constraint->m_rbB.getAngularFactor();\n"
 "			}\n"
-"\n"
 "			{\n"
 "				//it is ok to use solverConstraint->m_contactNormal instead of -solverConstraint->m_contactNormal\n"
 "				//because it gets multiplied iMJlB\n"
@@ -840,7 +687,6 @@ static const char* solveConstraintRowsCL= \
 "				float4 iMJaA = mtMul3(solverConstraint->m_relpos1CrossNormal,invInertiaWorldA);\n"
 "				float4 iMJlB = solverConstraint->m_contactNormal*rbB->m_invMass;//sign of normal?\n"
 "				float4 iMJaB = mtMul3(solverConstraint->m_relpos2CrossNormal,invInertiaWorldB);\n"
-"\n"
 "				float sum = dot3F4(iMJlA,solverConstraint->m_contactNormal);\n"
 "				sum += dot3F4(iMJaA,solverConstraint->m_relpos1CrossNormal);\n"
 "				sum += dot3F4(iMJlB,solverConstraint->m_contactNormal);\n"
@@ -854,17 +700,13 @@ static const char* solveConstraintRowsCL= \
 "					solverConstraint->m_jacDiagABInv = 0.f;\n"
 "				}\n"
 "			}\n"
-"\n"
-"\n"
 "			///fix rhs\n"
 "			///todo: add force/torque accelerators\n"
 "			{\n"
 "				float rel_vel;\n"
 "				float vel1Dotn = dot3F4(solverConstraint->m_contactNormal,rbA->m_linVel) + dot3F4(solverConstraint->m_relpos1CrossNormal,rbA->m_angVel);\n"
 "				float vel2Dotn = -dot3F4(solverConstraint->m_contactNormal,rbB->m_linVel) + dot3F4(solverConstraint->m_relpos2CrossNormal,rbB->m_angVel);\n"
-"\n"
 "				rel_vel = vel1Dotn+vel2Dotn;\n"
-"\n"
 "				float restitution = 0.f;\n"
 "				float positionalError = solverConstraint->m_rhs;//already filled in by getConstraintInfo2\n"
 "				float	velocityError = restitution - rel_vel * info2.m_damping;\n"
@@ -872,7 +714,6 @@ static const char* solveConstraintRowsCL= \
 "				float	velocityImpulse = velocityError *solverConstraint->m_jacDiagABInv;\n"
 "				solverConstraint->m_rhs = penetrationImpulse+velocityImpulse;\n"
 "				solverConstraint->m_appliedImpulse = 0.f;\n"
-"\n"
 "			}\n"
 "		}\n"
 "	}\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl
@@ -204,22 +204,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-	
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveContact.h
@@ -2,37 +2,29 @@
 static const char* solveContactCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,43 +38,28 @@ static const char* solveContactCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define mymake_float4 (float4)\n"
 "//#define make_float2 (float2)\n"
 "//#define make_uint4 (uint4)\n"
 "//#define make_int4 (int4)\n"
 "//#define make_uint2 (uint2)\n"
 "//#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -90,10 +67,6 @@ static const char* solveContactCL= \
 "	float4 b1 = mymake_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -102,33 +75,17 @@ static const char* solveContactCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -139,54 +96,39 @@ static const char* solveContactCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -195,34 +137,13 @@ static const char* solveContactCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"	\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -231,7 +152,6 @@ static const char* solveContactCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -240,27 +160,20 @@ static const char* solveContactCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = mymake_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
 "{\n"
@@ -271,32 +184,25 @@ static const char* solveContactCL= \
 "	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
 "	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
 "}\n"
-"\n"
-"\n"
 "void solveContact(__global Constraint4* cs,\n"
 "				  float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
 "				  float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
-"\n"
 "void solveContact(__global Constraint4* cs,\n"
 "			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
 "			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
 "{\n"
 "	float minRambdaDt = 0;\n"
 "	float maxRambdaDt = FLT_MAX;\n"
-"\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
-"\n"
 "		float4 angular0, angular1, linear;\n"
 "		float4 r0 = cs->m_worldPos[ic] - posA;\n"
 "		float4 r1 = cs->m_worldPos[ic] - posB;\n"
 "		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
-"\n"
 "		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
 "			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
 "		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
-"\n"
 "		{\n"
 "			float prevSum = cs->m_appliedRambdaDt[ic];\n"
 "			float updated = prevSum;\n"
@@ -306,19 +212,16 @@ static const char* solveContactCL= \
 "			rambdaDt = updated - prevSum;\n"
 "			cs->m_appliedRambdaDt[ic] = updated;\n"
 "		}\n"
-"\n"
 "		float4 linImp0 = invMassA*linear*rambdaDt;\n"
 "		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
 "		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
 "		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
-"\n"
 "		*linVelA += linImp0;\n"
 "		*angVelA += angImp0;\n"
 "		*linVelB += linImp1;\n"
 "		*angVelB += angImp1;\n"
 "	}\n"
 "}\n"
-"\n"
 "void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
 " void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
 "{\n"
@@ -347,29 +250,24 @@ static const char* solveContactCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
 "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
 "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
 "{\n"
 "	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "			posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
-"\n"
 "  if (gBodies[aIdx].m_invMass)\n"
 "  {\n"
 "		gBodies[aIdx].m_linVel = linVelA;\n"
@@ -390,27 +288,18 @@ static const char* solveContactCL= \
 "		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
 "	\n"
 "	}\n"
-"\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void BatchSolveKernelContact(__global Body* gBodies,\n"
@@ -427,21 +316,16 @@ static const char* solveContactCL= \
 "	__local int ldsCurBatch;\n"
 "	__local int ldsNextBatch;\n"
 "	__local int ldsStart;\n"
-"\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
-"\n"
 "//	int gIdx = GET_GLOBAL_IDX;\n"
 "//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
 "	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
-"\n"
-"\n"
 "	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
 "	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
 "	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
 "	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
 "	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
-"\n"
 "	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
 "	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
 "	//int cellIdx = xIdx+yIdx*nSplit;\n"
@@ -450,23 +334,18 @@ static const char* solveContactCL= \
 "		return;\n"
 "	\n"
 "	\n"
-"	\n"
 "	const int start = gOffsets[cellIdx];\n"
 "	const int end = start + gN[cellIdx];\n"
 "	\n"
 "	\n"
 "	\n"
-"	\n"
 "	if( lIdx == 0 )\n"
 "	{\n"
 "		ldsCurBatch = 0;\n"
 "		ldsNextBatch = 0;\n"
 "		ldsStart = start;\n"
 "	}\n"
-"\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	int idx=ldsStart+lIdx;\n"
 "	while (ldsCurBatch < maxBatch)\n"
 "	{\n"
@@ -475,7 +354,6 @@ static const char* solveContactCL= \
 "			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
 "			{\n"
 "					solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
-"\n"
 "				 idx+=64;\n"
 "			} else\n"
 "			{\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl
@@ -204,22 +204,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
--- a/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solveFriction.h
@@ -2,37 +2,29 @@
 static const char* solveFrictionCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
 "//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,43 +38,28 @@ static const char* solveFrictionCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define mymake_float4 (float4)\n"
 "//#define make_float2 (float2)\n"
 "//#define make_uint4 (uint4)\n"
 "//#define make_int4 (int4)\n"
 "//#define make_uint2 (uint2)\n"
 "//#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -90,10 +67,6 @@ static const char* solveFrictionCL= \
 "	float4 b1 = mymake_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -102,33 +75,17 @@ static const char* solveFrictionCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -139,54 +96,39 @@ static const char* solveFrictionCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -195,34 +137,13 @@ static const char* solveFrictionCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -231,7 +152,6 @@ static const char* solveFrictionCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -240,27 +160,20 @@ static const char* solveFrictionCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = mymake_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "				   float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
 "{\n"
@@ -299,33 +212,26 @@ static const char* solveFrictionCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
-"\n"
 "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
 "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
 "{\n"
 "	float frictionCoeff = ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
 "	\n"
-"\n"
 "	{\n"
 "		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
 "		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
-"\n"
 "		float sum = 0;\n"
 "		for(int j=0; j<4; j++)\n"
 "		{\n"
@@ -338,7 +244,6 @@ static const char* solveFrictionCL= \
 "			minRambdaDt[j] = -maxRambdaDt[j];\n"
 "		}\n"
 "		\n"
-"		\n"
 "//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
 "		\n"
@@ -399,9 +304,7 @@ static const char* solveFrictionCL= \
 "		}\n"
 "		\n"
 "		\n"
-"		\n"
 "	}\n"
-"\n"
 "	if (gBodies[aIdx].m_invMass)\n"
 "	{\n"
 "		gBodies[aIdx].m_linVel = linVelA;\n"
@@ -421,25 +324,18 @@ static const char* solveFrictionCL= \
 "		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
 "	}\n"
 " \n"
-"\n"
 "}\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
 "void BatchSolveKernelFriction(__global Body* gBodies,\n"
@@ -456,39 +352,29 @@ static const char* solveFrictionCL= \
 "	__local int ldsCurBatch;\n"
 "	__local int ldsNextBatch;\n"
 "	__local int ldsStart;\n"
-"\n"
 "	int lIdx = GET_LOCAL_IDX;\n"
 "	int wgIdx = GET_GROUP_IDX;\n"
-"\n"
 "//	int gIdx = GET_GLOBAL_IDX;\n"
 "//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
 "	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
-"\n"
-"\n"
 "	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
 "	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
 "	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
 "	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
 "	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
 "	\n"
-"	\n"
 "	if( gN[cellIdx] == 0 ) \n"
 "		return;\n"
-"\n"
 "	const int start = gOffsets[cellIdx];\n"
 "	const int end = start + gN[cellIdx];\n"
 "	\n"
-"	\n"
 "	if( lIdx == 0 )\n"
 "	{\n"
 "		ldsCurBatch = 0;\n"
 "		ldsNextBatch = 0;\n"
 "		ldsStart = start;\n"
 "	}\n"
-"\n"
-"\n"
 "	GROUP_LDS_BARRIER;\n"
-"\n"
 "	int idx=ldsStart+lIdx;\n"
 "	while (ldsCurBatch < maxBatch)\n"
 "	{\n"
@@ -496,9 +382,7 @@ static const char* solveFrictionCL= \
 "		{\n"
 "			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
 "			{\n"
-"\n"
 "					solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
-"\n"
 "				 idx+=64;\n"
 "			} else\n"
 "			{\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl
@@ -14,6 +14,7 @@ subject to the following restrictions:
 */
 //Originally written by Takahiro Harada

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"

 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
@@ -403,22 +404,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
@@ -525,7 +511,7 @@ void btPlaneSpace1 (float4 n, float4* p, float4* q);

 void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
 	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, 
-	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,
 	Constraint4* dstC )
 {
 	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
@@ -622,7 +608,7 @@ typedef struct

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, 
+void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, 
 int nContacts,
 float dt,
 float positionDrift,
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
@@ -2,37 +2,71 @@
 static const char* solverSetupCL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,22 +80,15 @@ static const char* solverSetupCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
@@ -71,52 +98,43 @@ static const char* solverSetupCL= \
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastSqrtf(float f2)\n"
 "{\n"
 "	return native_sqrt(f2);\n"
 "//	return sqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastRSqrt(float f2)\n"
 "{\n"
 "	return native_rsqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastLength4(float4 v)\n"
 "{\n"
 "	return fast_length(v);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float sqrtf(float a)\n"
 "{\n"
 "//	return sqrt(a);\n"
 "	return native_sqrt(a);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -124,26 +142,22 @@ static const char* solverSetupCL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float length3(const float4 a)\n"
 "{\n"
 "	return sqrtf(dot3F4(a,a));\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot4(const float4 a, const float4 b)\n"
 "{\n"
 "	return dot( a, b );\n"
 "}\n"
-"\n"
 "//	for height\n"
 "__inline\n"
 "float dot3w1(const float4 point, const float4 eqn)\n"
 "{\n"
 "	return dot3F4(point,eqn) + eqn.w;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -152,14 +166,12 @@ static const char* solverSetupCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize4(const float4 a)\n"
 "{\n"
 "	float length = sqrtf(dot4(a, a));\n"
 "	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
 "{\n"
@@ -170,34 +182,25 @@ static const char* solverSetupCL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero()\n"
 "{\n"
@@ -207,7 +210,6 @@ static const char* solverSetupCL= \
 "	m.m_row[2] = (float4)(0.f);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity()\n"
 "{\n"
@@ -217,7 +219,6 @@ static const char* solverSetupCL= \
 "	m.m_row[2] = (float4)(0,0,1,0);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -227,7 +228,6 @@ static const char* solverSetupCL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -248,7 +248,6 @@ static const char* solverSetupCL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -259,44 +258,32 @@ static const char* solverSetupCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 qtGetRotationMatrix(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -307,7 +294,6 @@ static const char* solverSetupCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -324,67 +310,52 @@ static const char* solverSetupCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 qtGetRotationMatrix(Quaternion quat)\n"
 "{\n"
 "	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
 "	Matrix3x3 out;\n"
-"\n"
 "	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
 "	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
 "	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
 "	out.m_row[0].w = 0.f;\n"
-"\n"
 "	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
 "	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
 "	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
 "	out.m_row[1].w = 0.f;\n"
-"\n"
 "	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
 "	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
 "	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
 "	out.m_row[2].w = 0.f;\n"
-"\n"
 "	return out;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -393,34 +364,13 @@ static const char* solverSetupCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -429,7 +379,6 @@ static const char* solverSetupCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -438,22 +387,16 @@ static const char* solverSetupCL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
-"\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = make_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
 "{\n"
@@ -465,26 +408,17 @@ static const char* solverSetupCL= \
 "	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
 "}\n"
 " \n"
-"\n"
-"\n"
-" \n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
@@ -492,8 +426,6 @@ static const char* solverSetupCL= \
 "	float m_scale;\n"
 "	int m_nSplit;\n"
 "} ConstBufferSSD;\n"
-"\n"
-"\n"
 "void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
 " void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
 "{\n"
@@ -522,84 +454,68 @@ static const char* solverSetupCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
-"\n"
 "void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
 "	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
-"	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
+"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,\n"
 "	Constraint4* dstC )\n"
 "{\n"
 "	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
 "	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
-"\n"
 "	float dtInv = 1.f/dt;\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "	}\n"
 "	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
-"\n"
-"\n"
 "	dstC->m_linear = -src->m_worldNormal;\n"
 "	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		float4 r0 = src->m_worldPos[ic] - posA;\n"
 "		float4 r1 = src->m_worldPos[ic] - posB;\n"
-"\n"
 "		if( ic >= src->m_worldNormal.w )//npoints\n"
 "		{\n"
 "			dstC->m_jacCoeffInv[ic] = 0.f;\n"
 "			continue;\n"
 "		}\n"
-"\n"
 "		float relVelN;\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
-"\n"
 "			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
 "				linVelA, angVelA, linVelB, angVelB);\n"
-"\n"
 "			float e = 0.f;//src->getRestituitionCoeff();\n"
 "			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
-"\n"
 "			dstC->m_b[ic] = e*relVelN;\n"
 "			//float penetration = src->m_worldPos[ic].w;\n"
 "			dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
 "			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "		}\n"
 "	}\n"
-"\n"
 "	if( src->m_worldNormal.w > 0 )//npoints\n"
 "	{	//	prepare friction\n"
 "		float4 center = make_float4(0.f);\n"
 "		for(int i=0; i<src->m_worldNormal.w; i++) \n"
 "			center += src->m_worldPos[i];\n"
 "		center /= (float)src->m_worldNormal.w;\n"
-"\n"
 "		float4 tangent[2];\n"
 "		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
 "		\n"
 "		float4 r[2];\n"
 "		r[0] = center - posA;\n"
 "		r[1] = center - posB;\n"
-"\n"
 "		for(int i=0; i<2; i++)\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB );\n"
 "			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
 "		}\n"
 "		dstC->m_center = center;\n"
 "	}\n"
-"\n"
 "	for(int i=0; i<4; i++)\n"
 "	{\n"
 "		if( i<src->m_worldNormal.w )\n"
@@ -612,7 +528,6 @@ static const char* solverSetupCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
@@ -620,10 +535,9 @@ static const char* solverSetupCL= \
 "	float m_positionDrift;\n"
 "	float m_positionConstraintCoeff;\n"
 "} ConstBufferCTC;\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ContactToConstraintKernel(__global Contact4* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n"
+"void ContactToConstraintKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global Shape* gShapes, __global Constraint4* gConstraintOut, \n"
 "int nContacts,\n"
 "float dt,\n"
 "float positionDrift,\n"
@@ -636,33 +550,23 @@ static const char* solverSetupCL= \
 "	{\n"
 "		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
 "		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
-"\n"
 "		float4 posA = gBodies[aIdx].m_pos;\n"
 "		float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "		float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "		float invMassA = gBodies[aIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "		float4 posB = gBodies[bIdx].m_pos;\n"
 "		float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "		float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "		float invMassB = gBodies[bIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "		Constraint4 cs;\n"
-"\n"
 "    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
 "			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,\n"
 "			&cs );\n"
 "		\n"
 "		cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
-"\n"
 "		gConstraintOut[gIdx] = cs;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl
@@ -14,6 +14,8 @@ subject to the following restrictions:
 //Originally written by Takahiro Harada


+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
@@ -377,22 +379,7 @@ typedef struct
 	u32 m_paddings[1];
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;

-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;

 typedef struct
 {
@@ -435,7 +422,7 @@ typedef struct
 //	others
 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )
+void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )
 {
 	int nContacts = cb.x;
 	int gIdx = GET_GLOBAL_IDX;
@@ -448,7 +435,7 @@ void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __globa
 }

 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataChildShapeB(__global Contact4* contactsIn, __global int2* sortDataOut, int nContacts)
+void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -462,7 +449,7 @@ void SetDeterminismSortDataChildShapeB(__global Contact4* contactsIn, __global i
 }

 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataChildShapeA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)
+void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -478,7 +465,7 @@ void SetDeterminismSortDataChildShapeA(__global Contact4* contactsIn, __global i
 }

 __kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)
+void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -496,7 +483,7 @@ void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* s

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetDeterminismSortDataBodyB(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)
+void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)
 {
 	int gIdx = GET_GLOBAL_IDX;

@@ -552,7 +539,7 @@ static __constant const int gridTable8x8[] =

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, 
+void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, 
 int nContacts,float scale,int4 nSplit,int staticIdx)

 {
@@ -613,7 +600,7 @@ int nContacts,float scale,int4 nSplit,int staticIdx)

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )
+void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )
 {
 	int gIdx = GET_GLOBAL_IDX;
 	if( gIdx < cb.x )
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
@@ -2,37 +2,71 @@
 static const char* solverSetup2CL= \
 "/*\n"
 "Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Takahiro Harada\n"
-"\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -46,22 +80,15 @@ static const char* solverSetup2CL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
@@ -71,52 +98,43 @@ static const char* solverSetup2CL= \
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastSqrtf(float f2)\n"
 "{\n"
 "	return native_sqrt(f2);\n"
 "//	return sqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastRSqrt(float f2)\n"
 "{\n"
 "	return native_rsqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastLength4(float4 v)\n"
 "{\n"
 "	return fast_length(v);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float sqrtf(float a)\n"
 "{\n"
 "//	return sqrt(a);\n"
 "	return native_sqrt(a);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -124,26 +142,22 @@ static const char* solverSetup2CL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float length3(const float4 a)\n"
 "{\n"
 "	return sqrtf(dot3F4(a,a));\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot4(const float4 a, const float4 b)\n"
 "{\n"
 "	return dot( a, b );\n"
 "}\n"
-"\n"
 "//	for height\n"
 "__inline\n"
 "float dot3w1(const float4 point, const float4 eqn)\n"
 "{\n"
 "	return dot3F4(point,eqn) + eqn.w;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -152,14 +166,12 @@ static const char* solverSetup2CL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize4(const float4 a)\n"
 "{\n"
 "	float length = sqrtf(dot4(a, a));\n"
 "	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
 "{\n"
@@ -170,34 +182,25 @@ static const char* solverSetup2CL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero()\n"
 "{\n"
@@ -207,7 +210,6 @@ static const char* solverSetup2CL= \
 "	m.m_row[2] = (float4)(0.f);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity()\n"
 "{\n"
@@ -217,7 +219,6 @@ static const char* solverSetup2CL= \
 "	m.m_row[2] = (float4)(0,0,1,0);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -227,7 +228,6 @@ static const char* solverSetup2CL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -248,7 +248,6 @@ static const char* solverSetup2CL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -259,43 +258,30 @@ static const char* solverSetup2CL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -306,7 +292,6 @@ static const char* solverSetup2CL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -323,43 +308,33 @@ static const char* solverSetup2CL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -368,34 +343,13 @@ static const char* solverSetup2CL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
-"\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings[1];\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nConstraints;\n"
@@ -404,7 +358,6 @@ static const char* solverSetup2CL= \
 "	int m_nSplit;\n"
 "//	int m_paddings[1];\n"
 "} ConstBuffer;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_solveFriction;\n"
@@ -414,46 +367,34 @@ static const char* solverSetup2CL= \
 "//	int m_paddings[1];\n"
 "} ConstBufferBatchSolve;\n"
 " \n"
-"\n"
-" \n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	int m_valInt0;\n"
 "	int m_valInt1;\n"
 "	int m_valInt2;\n"
 "	int m_valInt3;\n"
-"\n"
 "	float m_val0;\n"
 "	float m_val1;\n"
 "	float m_val2;\n"
 "	float m_val3;\n"
 "} SolverDebugInfo;\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "//	others\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ReorderContactKernel(__global Contact4* in, __global Contact4* out, __global int2* sortData, int4 cb )\n"
+"void ReorderContactKernel(__global struct b3Contact4Data* in, __global struct b3Contact4Data* out, __global int2* sortData, int4 cb )\n"
 "{\n"
 "	int nContacts = cb.x;\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int srcIdx = sortData[gIdx].y;\n"
 "		out[gIdx] = in[srcIdx];\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataChildShapeB(__global Contact4* contactsIn, __global int2* sortDataOut, int nContacts)\n"
+"void SetDeterminismSortDataChildShapeB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sd;\n"
@@ -462,12 +403,10 @@ static const char* solverSetup2CL= \
 "		sortDataOut[gIdx] = sd;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataChildShapeA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"void SetDeterminismSortDataChildShapeA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sdIn;\n"
@@ -478,12 +417,10 @@ static const char* solverSetup2CL= \
 "		sortDataInOut[gIdx] = sdOut;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel __attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataBodyA(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"void SetDeterminismSortDataBodyA(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sdIn;\n"
@@ -494,14 +431,11 @@ static const char* solverSetup2CL= \
 "		sortDataInOut[gIdx] = sdOut;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetDeterminismSortDataBodyB(__global Contact4* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
+"void SetDeterminismSortDataBodyB(__global struct b3Contact4Data* contactsIn, __global int2* sortDataInOut, int nContacts)\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
 "	if( gIdx < nContacts )\n"
 "	{\n"
 "		int2 sdIn;\n"
@@ -512,10 +446,6 @@ static const char* solverSetup2CL= \
 "		sortDataInOut[gIdx] = sdOut;\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	int m_nContacts;\n"
@@ -523,8 +453,6 @@ static const char* solverSetup2CL= \
 "	float m_scale;\n"
 "	int m_nSplit;\n"
 "} ConstBufferSSD;\n"
-"\n"
-"\n"
 "static __constant const int gridTable4x4[] = \n"
 "{\n"
 "    0,1,17,16,\n"
@@ -532,7 +460,6 @@ static const char* solverSetup2CL= \
 "	17,18,32,3,\n"
 "	16,19,3,34\n"
 "};\n"
-"\n"
 "static __constant const int gridTable8x8[] = \n"
 "{\n"
 "	  0,  2,  3, 16, 17, 18, 19,  1,\n"
@@ -545,18 +472,12 @@ static const char* solverSetup2CL= \
 "	197,27,214,213,212,199,198,196\n"
 "	\n"
 "};\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define USE_SPATIAL_BATCHING 1\n"
 "#define USE_4x4_GRID 1\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void SetSortDataKernel(__global Contact4* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
+"void SetSortDataKernel(__global struct b3Contact4Data* gContact, __global Body* gBodies, __global int2* gSortDataOut, \n"
 "int nContacts,float scale,int4 nSplit,int staticIdx)\n"
-"\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	\n"
@@ -564,13 +485,10 @@ static const char* solverSetup2CL= \
 "	{\n"
 "		int aPtrAndSignBit  = gContact[gIdx].m_bodyAPtrAndSignBit;\n"
 "		int bPtrAndSignBit  = gContact[gIdx].m_bodyBPtrAndSignBit;\n"
-"\n"
 "		int aIdx = abs(aPtrAndSignBit );\n"
 "		int bIdx = abs(bPtrAndSignBit);\n"
-"\n"
 "		bool aStatic = (aPtrAndSignBit<0) ||(aPtrAndSignBit==staticIdx);\n"
 "		bool bStatic = (bPtrAndSignBit<0) ||(bPtrAndSignBit==staticIdx);\n"
-"\n"
 "#if USE_SPATIAL_BATCHING		\n"
 "		int idx = (aStatic)? bIdx: aIdx;\n"
 "		float4 p = gBodies[idx].m_pos;\n"
@@ -587,7 +505,6 @@ static const char* solverSetup2CL= \
 "			aa = bb;\n"
 "		if (bStatic)\n"
 "			bb = aa;\n"
-"\n"
 "		int gridIndex = aa + bb*4;\n"
 "		int newIndex = gridTable4x4[gridIndex];\n"
 "	#else//USE_4x4_GRID\n"
@@ -597,13 +514,10 @@ static const char* solverSetup2CL= \
 "			aa = bb;\n"
 "		if (bStatic)\n"
 "			bb = aa;\n"
-"\n"
 "		int gridIndex = aa + bb*8;\n"
 "		int newIndex = gridTable8x8[gridIndex];\n"
 "	#endif//USE_4x4_GRID\n"
 "#endif//USE_SPATIAL_BATCHING\n"
-"\n"
-"\n"
 "		gSortDataOut[gIdx].x = newIndex;\n"
 "		gSortDataOut[gIdx].y = gIdx;\n"
 "	}\n"
@@ -612,10 +526,9 @@ static const char* solverSetup2CL= \
 "		gSortDataOut[gIdx].x = 0xffffffff;\n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void CopyConstraintKernel(__global Contact4* gIn, __global Contact4* gOut, int4 cb )\n"
+"void CopyConstraintKernel(__global struct b3Contact4Data* gIn, __global struct b3Contact4Data* gOut, int4 cb )\n"
 "{\n"
 "	int gIdx = GET_GLOBAL_IDX;\n"
 "	if( gIdx < cb.x )\n"
@@ -623,7 +536,4 @@ static const char* solverSetup2CL= \
 "		gOut[gIdx] = gIn[gIdx];\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.cl
@@ -13,6 +13,8 @@ subject to the following restrictions:
 */
 //Originally written by Erwin Coumans

+#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
+
 #pragma OPENCL EXTENSION cl_amd_printf : enable
 #pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
 #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
@@ -380,25 +382,10 @@ typedef struct
 	u32 m_paddings;
 } Constraint4;

-typedef struct
-{
-	float4 m_worldPos[4];
-	float4 m_worldNormal;
-	u32 m_coeffs;
-	int m_batchIdx;
-
-	int m_bodyAPtrAndSignBit;
-	int m_bodyBPtrAndSignBit;
-
-	int	m_childIndexA;
-	int	m_childIndexB;
-	int m_unused1;
-	int m_unused2;
-
-} Contact4;


-__kernel void CountBodiesKernel(__global Contact4* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)
+
+__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)
 {
 	int i = GET_GLOBAL_IDX;
 	
@@ -844,7 +831,7 @@ __kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* of

 void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,
 	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, 
-	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,
+	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,
 	Constraint4* dstC )
 {
 	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);
@@ -934,7 +921,7 @@ void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVe

 __kernel
 __attribute__((reqd_work_group_size(WG_SIZE,1,1)))
-void ContactToConstraintSplitKernel(__global const Contact4* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, 
+void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, 
 __global const unsigned int* bodyCount,
 int nContacts,
 float dt,
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
@@ -2,36 +2,71 @@
 static const char* solverUtilsCL= \
 "/*\n"
 "Copyright (c) 2013 Advanced Micro Devices, Inc.  \n"
-"\n"
 "This software is provided 'as-is', without any express or implied warranty.\n"
 "In no event will the authors be held liable for any damages arising from the use of this software.\n"
 "Permission is granted to anyone to use this software for any purpose, \n"
 "including commercial applications, and to alter it and redistribute it freely, \n"
 "subject to the following restrictions:\n"
-"\n"
 "1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
 "2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
 "3. This notice may not be removed or altered from any source distribution.\n"
 "*/\n"
 "//Originally written by Erwin Coumans\n"
-"\n"
+"#ifndef B3_CONTACT4DATA_H\n"
+"#define B3_CONTACT4DATA_H\n"
+"#ifndef B3_FLOAT4_H\n"
+"#define B3_FLOAT4_H\n"
+"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
+"#define B3_PLATFORM_DEFINITIONS_H\n"
+"struct MyTest\n"
+"{\n"
+"	int bla;\n"
+"};\n"
+"#endif\n"
+"#ifdef __cplusplus\n"
+"#else//bla\n"
+"	typedef float4	b3Float4;\n"
+"#endif \n"
+"#endif //B3_FLOAT4_H\n"
+"typedef  struct b3Contact4Data b3Contact4Data_t;\n"
+"struct b3Contact4Data\n"
+"{\n"
+"	b3Float4	m_worldPos[4];\n"
+"//	b3Float4	m_localPosB[4];\n"
+"	b3Float4	m_worldNormal;	//	w: m_nPoints\n"
+"	unsigned short  m_restituitionCoeffCmp;\n"
+"	unsigned short  m_frictionCoeffCmp;\n"
+"	int m_batchIdx;\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"	int	m_childIndexA;\n"
+"	int	m_childIndexB;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"	b3Float4	m_localPosA;\n"
+"};\n"
+"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
+"{\n"
+"	return (int)contact->m_worldNormal.w;\n"
+"};\n"
+"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
+"{\n"
+"	contact->m_worldNormal.w = (float)numPoints;\n"
+"};\n"
+"#endif //B3_CONTACT4DATA_H\n"
 "#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
 "#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
-"\n"
-"\n"
 "#ifdef cl_ext_atomic_counters_32\n"
 "#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
 "#else\n"
 "#define counter32_t volatile global int*\n"
 "#endif\n"
-"\n"
 "typedef unsigned int u32;\n"
 "typedef unsigned short u16;\n"
 "typedef unsigned char u8;\n"
-"\n"
 "#define GET_GROUP_IDX get_group_id(0)\n"
 "#define GET_LOCAL_IDX get_local_id(0)\n"
 "#define GET_GLOBAL_IDX get_global_id(0)\n"
@@ -45,22 +80,15 @@ static const char* solverUtilsCL= \
 "#define AtomAdd(x, value) atom_add(&(x), value)\n"
 "#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
 "#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
-"\n"
-"\n"
 "#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
-"\n"
 "#define make_float4 (float4)\n"
 "#define make_float2 (float2)\n"
 "#define make_uint4 (uint4)\n"
 "#define make_int4 (int4)\n"
 "#define make_uint2 (uint2)\n"
 "#define make_int2 (int2)\n"
-"\n"
-"\n"
 "#define max2 max\n"
 "#define min2 min\n"
-"\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Vector\n"
 "///////////////////////////////////////\n"
@@ -70,57 +98,47 @@ static const char* solverUtilsCL= \
 "	return native_divide(numerator, denominator);	\n"
 "//	return numerator/denominator;	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastDiv4(float4 numerator, float4 denominator)\n"
 "{\n"
 "	return native_divide(numerator, denominator);	\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastSqrtf(float f2)\n"
 "{\n"
 "	return native_sqrt(f2);\n"
 "//	return sqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastRSqrt(float f2)\n"
 "{\n"
 "	return native_rsqrt(f2);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float fastLength4(float4 v)\n"
 "{\n"
 "	return fast_length(v);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 fastNormalize4(float4 v)\n"
 "{\n"
 "	return fast_normalize(v);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "float sqrtf(float a)\n"
 "{\n"
 "//	return sqrt(a);\n"
 "	return native_sqrt(a);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a1, float4 b1)\n"
 "{\n"
-"\n"
 "	float4 	a=make_float4(a1.xyz,0.f);\n"
 "	float4 	b=make_float4(b1.xyz,0.f);\n"
 "	//float4 	a=a1;\n"
 "	//float4 	b=b1;\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -128,26 +146,22 @@ static const char* solverUtilsCL= \
 "	float4 b1 = make_float4(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float length3(const float4 a)\n"
 "{\n"
 "	return sqrtf(dot3F4(a,a));\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot4(const float4 a, const float4 b)\n"
 "{\n"
 "	return dot( a, b );\n"
 "}\n"
-"\n"
 "//	for height\n"
 "__inline\n"
 "float dot3w1(const float4 point, const float4 eqn)\n"
 "{\n"
 "	return dot3F4(point,eqn) + eqn.w;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize3(const float4 a)\n"
 "{\n"
@@ -156,14 +170,12 @@ static const char* solverUtilsCL= \
 "//	float length = sqrtf(dot3F4(a, a));\n"
 "//	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 normalize4(const float4 a)\n"
 "{\n"
 "	float length = sqrtf(dot4(a, a));\n"
 "	return 1.f/length * a;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 createEquation(const float4 a, const float4 b, const float4 c)\n"
 "{\n"
@@ -174,34 +186,25 @@ static const char* solverUtilsCL= \
 "	eqn.w = -dot3F4(eqn,a);\n"
 "	return eqn;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Matrix3x3\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_row[3];\n"
 "}Matrix3x3;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity();\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b);\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b);\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtZero()\n"
 "{\n"
@@ -211,7 +214,6 @@ static const char* solverUtilsCL= \
 "	m.m_row[2] = (float4)(0.f);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtIdentity()\n"
 "{\n"
@@ -221,7 +223,6 @@ static const char* solverUtilsCL= \
 "	m.m_row[2] = (float4)(0,0,1,0);\n"
 "	return m;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -231,7 +232,6 @@ static const char* solverUtilsCL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -252,7 +252,6 @@ static const char* solverUtilsCL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul1(Matrix3x3 a, float4 b)\n"
 "{\n"
@@ -263,43 +262,30 @@ static const char* solverUtilsCL= \
 "	ans.w = 0.f;\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 mtMul3(float4 a, Matrix3x3 b)\n"
 "{\n"
 "	float4 colx = make_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
 "	float4 coly = make_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
 "	float4 colz = make_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
-"\n"
 "	float4 ans;\n"
 "	ans.x = dot3F4( a, colx );\n"
 "	ans.y = dot3F4( a, coly );\n"
 "	ans.z = dot3F4( a, colz );\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "///////////////////////////////////////\n"
 "//	Quaternion\n"
 "///////////////////////////////////////\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b);\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in);\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec);\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q);\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -310,7 +296,6 @@ static const char* solverUtilsCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtNormalize(Quaternion in)\n"
 "{\n"
@@ -327,43 +312,33 @@ static const char* solverUtilsCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtInvRotate(const Quaternion q, float4 vec)\n"
 "{\n"
 "	return qtRotate( qtInvert( q ), vec );\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "#define WG_SIZE 64\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	Quaternion m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_shapeIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_linear;\n"
@@ -372,35 +347,14 @@ static const char* solverUtilsCL= \
 "	float m_jacCoeffInv[4];\n"
 "	float m_b[4];\n"
 "	float m_appliedRambdaDt[4];\n"
-"\n"
 "	float m_fJacCoeffInv[2];	\n"
 "	float m_fAppliedRambdaDt[2];	\n"
-"\n"
 "	u32 m_bodyA;\n"
 "	u32 m_bodyB;\n"
 "	int m_batchIdx;\n"
 "	u32 m_paddings;\n"
 "} Constraint4;\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	float4 m_worldPos[4];\n"
-"	float4 m_worldNormal;\n"
-"	u32 m_coeffs;\n"
-"	int m_batchIdx;\n"
-"\n"
-"	int m_bodyAPtrAndSignBit;\n"
-"	int m_bodyBPtrAndSignBit;\n"
-"\n"
-"	int	m_childIndexA;\n"
-"	int	m_childIndexB;\n"
-"	int m_unused1;\n"
-"	int m_unused2;\n"
-"\n"
-"} Contact4;\n"
-"\n"
-"\n"
-"__kernel void CountBodiesKernel(__global Contact4* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n"
+"__kernel void CountBodiesKernel(__global struct b3Contact4Data* manifoldPtr, __global unsigned int* bodyCount, __global int2* contactConstraintOffsets, int numContactManifolds, int fixedBodyIndex)\n"
 "{\n"
 "	int i = GET_GLOBAL_IDX;\n"
 "	\n"
@@ -423,7 +377,6 @@ static const char* solverUtilsCL= \
 "		} \n"
 "	}\n"
 "}\n"
-"\n"
 "__kernel void ClearVelocitiesKernel(__global float4* linearVelocities,__global float4* angularVelocities, int numSplitBodies)\n"
 "{\n"
 "	int i = GET_GLOBAL_IDX;\n"
@@ -434,8 +387,6 @@ static const char* solverUtilsCL= \
 "		angularVelocities[i] = make_float4(0);\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void AverageVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
 "__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
 "{\n"
@@ -465,23 +416,16 @@ static const char* solverUtilsCL= \
 "		}//bodies[i].m_invMass\n"
 "	}//i<numBodies\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
 "{\n"
 "	*linear = make_float4(-n.xyz,0.f);\n"
 "	*angular0 = -cross3(r0, n);\n"
 "	*angular1 = cross3(r1, n);\n"
 "}\n"
-"\n"
-"\n"
 "float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
 "{\n"
 "	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
 "}\n"
-"\n"
-"\n"
 "float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
 "					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1, float countA, float countB)\n"
 "{\n"
@@ -492,8 +436,6 @@ static const char* solverUtilsCL= \
 "	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
 "	return -1.f/((jmj0+jmj1)*countA+(jmj2+jmj3)*countB);\n"
 "}\n"
-"\n"
-"\n"
 "void btPlaneSpace1 (float4 n, float4* p, float4* q);\n"
 " void btPlaneSpace1 (float4 n, float4* p, float4* q)\n"
 "{\n"
@@ -522,11 +464,6 @@ static const char* solverUtilsCL= \
 "	q[0].z = a*k;\n"
 "  }\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "void solveContact(__global Constraint4* cs,\n"
 "			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
 "			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB,\n"
@@ -534,23 +471,18 @@ static const char* solverUtilsCL= \
 "{\n"
 "	float minRambdaDt = 0;\n"
 "	float maxRambdaDt = FLT_MAX;\n"
-"\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
-"\n"
 "		float4 angular0, angular1, linear;\n"
 "		float4 r0 = cs->m_worldPos[ic] - posA;\n"
 "		float4 r1 = cs->m_worldPos[ic] - posB;\n"
 "		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
 "	\n"
-"\n"
-"\n"
 "		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
 "			*linVelA+*dLinVelA, *angVelA+*dAngVelA, *linVelB+*dLinVelB, *angVelB+*dAngVelB ) + cs->m_b[ic];\n"
 "		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
 "		\n"
-"		\n"
 "		{\n"
 "			float prevSum = cs->m_appliedRambdaDt[ic];\n"
 "			float updated = prevSum;\n"
@@ -561,13 +493,11 @@ static const char* solverUtilsCL= \
 "			cs->m_appliedRambdaDt[ic] = updated;\n"
 "		}\n"
 "			\n"
-"			\n"
 "		float4 linImp0 = invMassA*linear*rambdaDt;\n"
 "		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
 "		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
 "		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
 "		\n"
-"		\n"
 "		if (invMassA)\n"
 "		{\n"
 "			*dLinVelA += linImp0;\n"
@@ -580,33 +510,25 @@ static const char* solverUtilsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "//	solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,contactConstraintOffsets,offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
-"\n"
-"\n"
 "void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs, \n"
 "__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
 "__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
 "{\n"
-"\n"
 "	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
 "			\n"
-"			\n"
 "	float4 dLinVelA = make_float4(0,0,0,0);\n"
 "	float4 dAngVelA = make_float4(0,0,0,0);\n"
 "	float4 dLinVelB = make_float4(0,0,0,0);\n"
@@ -621,20 +543,16 @@ static const char* solverUtilsCL= \
 "		dLinVelA = deltaLinearVelocities[splitIndexA];\n"
 "		dAngVelA = deltaAngularVelocities[splitIndexA];\n"
 "	}\n"
-"\n"
 "	int bodyOffsetB = offsetSplitBodies[bIdx];\n"
 "	int constraintOffsetB = contactConstraintOffsets[0].y;\n"
 "	int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
-"\n"
 "	if (invMassB)\n"
 "	{\n"
 "		dLinVelB = deltaLinearVelocities[splitIndexB];\n"
 "		dAngVelB = deltaAngularVelocities[splitIndexB];\n"
 "	}\n"
-"\n"
 "	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "			posB, &linVelB, &angVelB, invMassB, invInertiaB ,&dLinVelA, &dAngVelA, &dLinVelB, &dAngVelB);\n"
-"\n"
 "	if (invMassA)\n"
 "	{\n"
 "		deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
@@ -645,10 +563,7 @@ static const char* solverUtilsCL= \
 "		deltaLinearVelocities[splitIndexB] = dLinVelB;\n"
 "		deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
 "	}\n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void SolveContactJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
 "__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
 "float deltaTime, float positionDrift, float positionConstraintCoeff, int fixedBodyIndex, int numManifolds\n"
@@ -660,10 +575,6 @@ static const char* solverUtilsCL= \
 "		solveContactConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs,\n"
 "							__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
 "							__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities)\n"
@@ -671,21 +582,17 @@ static const char* solverUtilsCL= \
 "	float frictionCoeff = 0.7f;//ldsCs[0].m_linear.w;\n"
 "	int aIdx = ldsCs[0].m_bodyA;\n"
 "	int bIdx = ldsCs[0].m_bodyB;\n"
-"\n"
-"\n"
 "	float4 posA = gBodies[aIdx].m_pos;\n"
 "	float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "	float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "	float invMassA = gBodies[aIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "	float4 posB = gBodies[bIdx].m_pos;\n"
 "	float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "	float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "	float invMassB = gBodies[bIdx].m_invMass;\n"
 "	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
 "	\n"
-"\n"
 "	float4 dLinVelA = make_float4(0,0,0,0);\n"
 "	float4 dAngVelA = make_float4(0,0,0,0);\n"
 "	float4 dLinVelB = make_float4(0,0,0,0);\n"
@@ -700,24 +607,17 @@ static const char* solverUtilsCL= \
 "		dLinVelA = deltaLinearVelocities[splitIndexA];\n"
 "		dAngVelA = deltaAngularVelocities[splitIndexA];\n"
 "	}\n"
-"\n"
 "	int bodyOffsetB = offsetSplitBodies[bIdx];\n"
 "	int constraintOffsetB = contactConstraintOffsets[0].y;\n"
 "	int splitIndexB= bodyOffsetB+constraintOffsetB;\n"
-"\n"
 "	if (invMassB)\n"
 "	{\n"
 "		dLinVelB = deltaLinearVelocities[splitIndexB];\n"
 "		dAngVelB = deltaAngularVelocities[splitIndexB];\n"
 "	}\n"
-"\n"
-"\n"
-"\n"
-"\n"
 "	{\n"
 "		float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
 "		float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
-"\n"
 "		float sum = 0;\n"
 "		for(int j=0; j<4; j++)\n"
 "		{\n"
@@ -730,7 +630,6 @@ static const char* solverUtilsCL= \
 "			minRambdaDt[j] = -maxRambdaDt[j];\n"
 "		}\n"
 "		\n"
-"		\n"
 "//		solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
 "//			posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
 "		\n"
@@ -791,9 +690,7 @@ static const char* solverUtilsCL= \
 "		}\n"
 "		\n"
 "		\n"
-"		\n"
 "	}\n"
-"\n"
 "	if (invMassA)\n"
 "	{\n"
 "		deltaLinearVelocities[splitIndexA] = dLinVelA;\n"
@@ -805,10 +702,7 @@ static const char* solverUtilsCL= \
 "		deltaAngularVelocities[splitIndexB] = dAngVelB;\n"
 "	}\n"
 " \n"
-"\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void SolveFrictionJacobiKernel(__global Constraint4* gConstraints, __global Body* gBodies, __global Shape* gShapes ,\n"
 "										__global int2* contactConstraintOffsets,__global unsigned int* offsetSplitBodies,\n"
 "										__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities,\n"
@@ -821,8 +715,6 @@ static const char* solverUtilsCL= \
 "		solveFrictionConstraint( gBodies, gShapes, &gConstraints[i] ,&contactConstraintOffsets[i],offsetSplitBodies, deltaLinearVelocities, deltaAngularVelocities);\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void UpdateBodyVelocitiesKernel(__global Body* gBodies,__global int* offsetSplitBodies,__global const unsigned int* bodyCount,\n"
 "									__global float4* deltaLinearVelocities, __global float4* deltaAngularVelocities, int numBodies)\n"
 "{\n"
@@ -841,85 +733,68 @@ static const char* solverUtilsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "void setConstraint4( const float4 posA, const float4 linVelA, const float4 angVelA, float invMassA, const Matrix3x3 invInertiaA,\n"
 "	const float4 posB, const float4 linVelB, const float4 angVelB, float invMassB, const Matrix3x3 invInertiaB, \n"
-"	__global Contact4* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n"
+"	__global struct b3Contact4Data* src, float dt, float positionDrift, float positionConstraintCoeff,float countA, float countB,\n"
 "	Constraint4* dstC )\n"
 "{\n"
 "	dstC->m_bodyA = abs(src->m_bodyAPtrAndSignBit);\n"
 "	dstC->m_bodyB = abs(src->m_bodyBPtrAndSignBit);\n"
-"\n"
 "	float dtInv = 1.f/dt;\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "	}\n"
 "	dstC->m_fJacCoeffInv[0] = dstC->m_fJacCoeffInv[1] = 0.f;\n"
-"\n"
-"\n"
 "	dstC->m_linear = -src->m_worldNormal;\n"
 "	dstC->m_linear.w = 0.7f ;//src->getFrictionCoeff() );\n"
 "	for(int ic=0; ic<4; ic++)\n"
 "	{\n"
 "		float4 r0 = src->m_worldPos[ic] - posA;\n"
 "		float4 r1 = src->m_worldPos[ic] - posB;\n"
-"\n"
 "		if( ic >= src->m_worldNormal.w )//npoints\n"
 "		{\n"
 "			dstC->m_jacCoeffInv[ic] = 0.f;\n"
 "			continue;\n"
 "		}\n"
-"\n"
 "		float relVelN;\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(src->m_worldNormal, r0, r1, &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB , countA, countB);\n"
-"\n"
 "			relVelN = calcRelVel(linear, -linear, angular0, angular1,\n"
 "				linVelA, angVelA, linVelB, angVelB);\n"
-"\n"
 "			float e = 0.f;//src->getRestituitionCoeff();\n"
 "			if( relVelN*relVelN < 0.004f ) e = 0.f;\n"
-"\n"
 "			dstC->m_b[ic] = e*relVelN;\n"
 "			//float penetration = src->m_worldPos[ic].w;\n"
 "			dstC->m_b[ic] += (src->m_worldPos[ic].w + positionDrift)*positionConstraintCoeff*dtInv;\n"
 "			dstC->m_appliedRambdaDt[ic] = 0.f;\n"
 "		}\n"
 "	}\n"
-"\n"
 "	if( src->m_worldNormal.w > 0 )//npoints\n"
 "	{	//	prepare friction\n"
 "		float4 center = make_float4(0.f);\n"
 "		for(int i=0; i<src->m_worldNormal.w; i++) \n"
 "			center += src->m_worldPos[i];\n"
 "		center /= (float)src->m_worldNormal.w;\n"
-"\n"
 "		float4 tangent[2];\n"
 "		btPlaneSpace1(src->m_worldNormal,&tangent[0],&tangent[1]);\n"
 "		\n"
 "		float4 r[2];\n"
 "		r[0] = center - posA;\n"
 "		r[1] = center - posB;\n"
-"\n"
 "		for(int i=0; i<2; i++)\n"
 "		{\n"
 "			float4 linear, angular0, angular1;\n"
 "			setLinearAndAngular(tangent[i], r[0], r[1], &linear, &angular0, &angular1);\n"
-"\n"
 "			dstC->m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,\n"
 "				invMassA, &invInertiaA, invMassB, &invInertiaB ,countA, countB);\n"
 "			dstC->m_fAppliedRambdaDt[i] = 0.f;\n"
 "		}\n"
 "		dstC->m_center = center;\n"
 "	}\n"
-"\n"
 "	for(int i=0; i<4; i++)\n"
 "	{\n"
 "		if( i<src->m_worldNormal.w )\n"
@@ -932,11 +807,9 @@ static const char* solverUtilsCL= \
 "		}\n"
 "	}\n"
 "}\n"
-"\n"
-"\n"
 "__kernel\n"
 "__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"void ContactToConstraintSplitKernel(__global const Contact4* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n"
+"void ContactToConstraintSplitKernel(__global const struct b3Contact4Data* gContact, __global const Body* gBodies, __global const Shape* gShapes, __global Constraint4* gConstraintOut, \n"
 "__global const unsigned int* bodyCount,\n"
 "int nContacts,\n"
 "float dt,\n"
@@ -950,30 +823,24 @@ static const char* solverUtilsCL= \
 "	{\n"
 "		int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);\n"
 "		int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);\n"
-"\n"
 "		float4 posA = gBodies[aIdx].m_pos;\n"
 "		float4 linVelA = gBodies[aIdx].m_linVel;\n"
 "		float4 angVelA = gBodies[aIdx].m_angVel;\n"
 "		float invMassA = gBodies[aIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
-"\n"
 "		float4 posB = gBodies[bIdx].m_pos;\n"
 "		float4 linVelB = gBodies[bIdx].m_linVel;\n"
 "		float4 angVelB = gBodies[bIdx].m_angVel;\n"
 "		float invMassB = gBodies[bIdx].m_invMass;\n"
 "		Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
-"\n"
 "		Constraint4 cs;\n"
-"\n"
 "		float countA = invMassA ? (float)bodyCount[aIdx] : 1;\n"
 "		float countB = invMassB ? (float)bodyCount[bIdx] : 1;\n"
-"\n"
 "    	setConstraint4( posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,\n"
 "			&gContact[gIdx], dt, positionDrift, positionConstraintCoeff,countA,countB,\n"
 "			&cs  );\n"
 "		\n"
 "		cs.m_batchIdx = gContact[gIdx].m_batchIdx;\n"
-"\n"
 "		gConstraintOut[gIdx] = cs;\n"
 "	}\n"
 "}\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
@@ -1,15 +1,12 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* updateAabbsKernelCL= \
 "#define SHAPE_CONVEX_HULL 3\n"
-"\n"
 "typedef float4 Quaternion;\n"
-"\n"
 "__inline\n"
 "float4 cross3(float4 a, float4 b)\n"
 "{\n"
 "	return cross(a,b);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float dot3F4(float4 a, float4 b)\n"
 "{\n"
@@ -17,8 +14,6 @@ static const char* updateAabbsKernelCL= \
 "	float4 b1 = (float4)(b.xyz,0.f);\n"
 "	return dot(a1, b1);\n"
 "}\n"
-"\n"
-"\n"
 "__inline\n"
 "Quaternion qtMul(Quaternion a, Quaternion b)\n"
 "{\n"
@@ -28,13 +23,11 @@ static const char* updateAabbsKernelCL= \
 "	ans.w = a.w*b.w - dot3F4(a, b);\n"
 "	return ans;\n"
 "}\n"
-"\n"
 "__inline\n"
 "Quaternion qtInvert(Quaternion q)\n"
 "{\n"
 "	return (Quaternion)(-q.xyz, q.w);\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 qtRotate(Quaternion q, float4 vec)\n"
 "{\n"
@@ -44,34 +37,27 @@ static const char* updateAabbsKernelCL= \
 "	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
-"\n"
 "__inline\n"
 "float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
 "{\n"
 "	return qtRotate( *orientation, *p ) + (*translation);\n"
 "}\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4	m_row[3];\n"
 "} Matrix3x3;\n"
-"\n"
 "typedef unsigned int u32;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	float4 m_pos;\n"
 "	float4 m_quat;\n"
 "	float4 m_linVel;\n"
 "	float4 m_angVel;\n"
-"\n"
 "	u32 m_collidableIdx;\n"
 "	float m_invMass;\n"
 "	float m_restituitionCoeff;\n"
 "	float m_frictionCoeff;\n"
 "} Body;\n"
-"\n"
 "typedef struct Collidable\n"
 "{\n"
 "	int m_unused1;\n"
@@ -79,40 +65,30 @@ static const char* updateAabbsKernelCL= \
 "	int m_shapeType;\n"
 "	int m_shapeIndex;\n"
 "} Collidable;\n"
-"\n"
-"\n"
 "typedef struct\n"
 "{\n"
 "	Matrix3x3 m_invInertia;\n"
 "	Matrix3x3 m_initInvInertia;\n"
 "} Shape;\n"
-"\n"
-"\n"
 "__inline\n"
 "Matrix3x3 qtGetRotationMatrix(float4 quat)\n"
 "{\n"
 "	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
 "	Matrix3x3 out;\n"
-"\n"
 "	out.m_row[0].x=fabs(1-2*quat2.y-2*quat2.z);\n"
 "	out.m_row[0].y=fabs(2*quat.x*quat.y-2*quat.w*quat.z);\n"
 "	out.m_row[0].z=fabs(2*quat.x*quat.z+2*quat.w*quat.y);\n"
 "	out.m_row[0].w = 0.f;\n"
-"\n"
 "	out.m_row[1].x=fabs(2*quat.x*quat.y+2*quat.w*quat.z);\n"
 "	out.m_row[1].y=fabs(1-2*quat2.x-2*quat2.z);\n"
 "	out.m_row[1].z=fabs(2*quat.y*quat.z-2*quat.w*quat.x);\n"
 "	out.m_row[1].w = 0.f;\n"
-"\n"
 "	out.m_row[2].x=fabs(2*quat.x*quat.z-2*quat.w*quat.y);\n"
 "	out.m_row[2].y=fabs(2*quat.y*quat.z+2*quat.w*quat.x);\n"
 "	out.m_row[2].z=fabs(1-2*quat2.x-2*quat2.y);\n"
 "	out.m_row[2].w = 0.f;\n"
-"\n"
 "	return out;\n"
 "}\n"
-"\n"
-"\n"
 "typedef struct \n"
 "{\n"
 "	float			fx;\n"
@@ -120,7 +96,6 @@ static const char* updateAabbsKernelCL= \
 "	float			fz;\n"
 "	int	uw;\n"
 "} btAABBCL;\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtTranspose(Matrix3x3 m)\n"
 "{\n"
@@ -130,9 +105,6 @@ static const char* updateAabbsKernelCL= \
 "	out.m_row[2] = (float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
 "	return out;\n"
 "}\n"
-"\n"
-"\n"
-"\n"
 "__inline\n"
 "Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)\n"
 "{\n"
@@ -153,8 +125,6 @@ static const char* updateAabbsKernelCL= \
 "	}\n"
 "	return ans;\n"
 "}\n"
-"\n"
-"\n"
 "__kernel void initializeGpuAabbsFull(  const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)\n"
 "{\n"
 "	int nodeID = get_global_id(0);\n"
--- a/test/OpenCL/KernelLaunch/main.cpp
+++ b/test/OpenCL/KernelLaunch/main.cpp
@@ -0,0 +1,221 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it freely,
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include <stdio.h>
+#include <string.h>
+
+#include "Bullet3Common/b3Vector3.h"
+/// Host-side declaration of the contact struct; the OpenCL source string in this file declares the same fields in the same order, and main() copies instances between host and device.
+typedef b3Vector3 b3Float4;	// host stand-in for the kernel's float4; NOTE(review): assumes b3Vector3 has the same size as float4 - confirm
+typedef  struct b3Contact4Data b3Contact4Data_t;
+struct b3Contact4Data
+{
+	b3Float4	m_worldPos[4];
+	b3Float4	m_localPosA[4];
+	b3Float4	m_localPosB[4];
+	b3Float4	m_worldNormal;	//	w: number of contact points, stored as a float
+	unsigned short  m_restituitionCoeffCmp;
+	unsigned short  m_frictionCoeffCmp;
+	int m_batchIdx;
+	int m_bodyAPtrAndSignBit;// body A index; solver kernels take abs() of it. NOTE(review): meaning of the sign bit is not visible here
+	int m_bodyBPtrAndSignBit;// same encoding for body B
+	int	m_childIndexA;
+	int	m_childIndexB;
+	int m_unused1;
+	int m_unused2;
+
+};
+
+// MSTRINGIFY(A) expands to its argument as a string literal (preprocessor # operator).
+#define MSTRINGIFY(A) #A
+// OpenCL C source handed to the kernel compiler in main(). Comments written inside the macro argument are removed by the preprocessor and do not end up in the string.
+static const char* s_testKernelString= MSTRINGIFY(
+// MyTest is declared but not referenced anywhere else in this kernel source.
+struct MyTest
+{
+	int bla;
+};
+// Device-side declaration of the contact struct; the field list mirrors the host-side struct b3Contact4Data declared earlier in this file.
+typedef float4	b3Float4;
+typedef  struct b3Contact4Data b3Contact4Data_t;
+struct b3Contact4Data
+{
+	b3Float4	m_worldPos[4];
+	b3Float4	m_localPosA[4];
+	b3Float4	m_localPosB[4];
+	b3Float4	m_worldNormal;	//	w: number of contact points, stored as a float
+	unsigned short  m_restituitionCoeffCmp;
+	unsigned short  m_frictionCoeffCmp;
+	int m_batchIdx;
+	int m_bodyAPtrAndSignBit;// body A index; solver kernels take abs() of it. NOTE(review): meaning of the sign bit is not visible here
+	int m_bodyBPtrAndSignBit;// same encoding for body B
+	int	m_childIndexA;
+	int	m_childIndexB;
+	int m_unused1;
+	int m_unused2;
+
+};
+inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)
+{
+	return (int)contact->m_worldNormal.w;
+};
+inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)
+{
+	contact->m_worldNormal.w = (float)numPoints;
+};
+// Pointer type used for the third kernel argument.
+typedef volatile __global int* my_counter32_t;
+// testKernel: every work-item stores sizeof(b3Contact4Data_t) into testData at its local id;
+// the work-item with local id 0 also swaps contactData[0] and contactData[1]. numElements is never read.
+__kernel void   testKernel( __global int* testData, __global b3Contact4Data_t* contactData, my_counter32_t numElements)
+{
+	int id = get_local_id(0);	// local (per work-group) index, not the global one
+	int sz = sizeof(b3Contact4Data_t);
+	testData[id]=sz;
+
+	__private b3Contact4Data_t tmp;
+	if (id==0)
+	{
+		tmp = contactData[1];
+		contactData[1] = contactData[0];
+		contactData[0] = tmp;
+	}
+}
+
+
+
+);
+
+
+
+#include "Bullet3Common/b3Logging.h"
+
+/// Print callback installed through b3SetCustomPrintfFunc in main(): writes msg to stdout exactly as given.
+void myprintf(const char* msg)
+{
+	//OutputDebugStringA(msg);
+	printf("%s", msg);	// msg is text to print, not a format string: a '%' inside it must come out literally
+}
+/// Test driver: for each OpenCL platform creates a GPU context, then for each device compiles testKernel, launches it on 64 work-items and copies both buffers back to the host.
+int main(int argc, char* argv[])
+{
+	b3SetCustomPrintfFunc(myprintf);
+	//b3SetCustomWarningMessageFunc(myprintf);
+	//b3SetCustomErrorMessageFunc(myprintf);
+
+	b3Printf("test b3Printf\n");
+	b3Warning("test warning\n");
+	b3Error("test error\n");
+
+	int ciErrNum = 0;
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+	const char* vendorSDK = b3OpenCLUtils::getSdkVendorName();
+
+	b3Printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+	int numPlatforms = b3OpenCLUtils::getNumPlatforms();
+	b3Printf("Num Platforms = %d\n", numPlatforms);
+
+	for (int i=0;i<numPlatforms;i++)
+	{
+		cl_platform_id platform = b3OpenCLUtils::getPlatform(i);
+		b3OpenCLPlatformInfo platformInfo;
+		b3OpenCLUtils::getPlatformInfo(platform,&platformInfo);
+		b3Printf("--------------------------------\n");
+		b3Printf("Platform info for platform nr %d:\n",i);
+		b3Printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+		b3Printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+		b3Printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+
+		cl_context context = b3OpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
+		if (context)
+		{
+			int numDevices = b3OpenCLUtils::getNumDevices(context);
+			b3Printf("Num Devices = %d\n", numDevices);
+			for (int j=0;j<numDevices;j++)
+			{
+				cl_device_id dev = b3OpenCLUtils::getDevice(context,j);
+				b3OpenCLDeviceInfo devInfo;
+				b3OpenCLUtils::getDeviceInfo(dev,&devInfo);
+				b3OpenCLUtils::printDeviceInfo(dev);
+
+				int errNum = 0;
+
+				cl_command_queue queue = clCreateCommandQueue(context, dev, 0, &errNum);
+				// Nothing below can work without a queue; skip this device rather than hand a null queue to the arrays and launcher.
+				if (!queue) { printf("clCreateCommandQueue failed (error %d)\n",errNum); continue; }
+				cl_program pairBenchProg=0;
+
+				cl_kernel testKernel = b3OpenCLUtils::compileCLKernelFromString(context,dev,s_testKernelString,"testKernel",&errNum,pairBenchProg);
+				if (testKernel)
+				{
+					printf("kernel compiled ok\n");
+
+					int numWorkItems = 64;
+					b3OpenCLArray<int> deviceElements(context,queue);
+					b3OpenCLArray<int> atomicCounter(context,queue);
+					b3OpenCLArray<b3Contact4Data> deviceContacts(context,queue);
+					b3AlignedObjectArray<b3Contact4Data> hostContacts;
+					// Upload two contacts filled with different byte values (1 and 2) so they can be told apart after the kernel swaps them.
+					b3Contact4Data tmp;
+					int sz = sizeof(b3Contact4Data);
+					memset(&tmp,1,sz);
+					deviceContacts.push_back(tmp);
+					b3Contact4Data tmp2 = tmp;
+					memset(&tmp,2,sz);
+					deviceContacts.push_back(tmp);
+					b3Contact4Data tmp3 = tmp;
+					
+
+					atomicCounter.push_back(0);
+					deviceElements.resize(numWorkItems);
+					b3LauncherCL run(queue,testKernel);
+					run.setBuffer(deviceElements.getBufferCL());
+					run.setBuffer(deviceContacts.getBufferCL());
+					run.setBuffer(atomicCounter.getBufferCL());
+					
+					run.launch1D(numWorkItems);
+					// Copy the results back; tmp2/tmp3 receive the device-side contacts but are not checked here.
+					b3AlignedObjectArray<int> hostElements;
+					deviceElements.copyToHost(hostElements);
+					deviceContacts.copyToHost(hostContacts);
+					tmp2 = hostContacts[0];
+					tmp3 = hostContacts[1];
+					
+
+					printf("...\n");
+
+				} else
+				{
+					printf("kernel failed to compile\n");
+				}
+				// The device arrays and the launcher are out of scope here, so the kernel and queue can be released safely.
+				if (testKernel) clReleaseKernel(testKernel);
+				clReleaseCommandQueue(queue);
+			}
+			// Release only a context that was actually created.
+			clReleaseContext(context);
+		}
+	}
+
+	b3Printf("\npress <Enter>\n");
+	getchar();
+	return 0;
+}
--- a/test/OpenCL/KernelLaunch/premake4.lua
+++ b/test/OpenCL/KernelLaunch/premake4.lua
@@ -0,0 +1,33 @@
+function createProject(vendor)
+	-- Declares the console project "Test_OpenCL_kernel_launch_<vendor>", but only when findOpenCL(vendor) returns a truthy value.
+	hasCL = findOpenCL(vendor)
+	-- NOTE(review): hasCL is assigned without `local`, so it is a global; confirm nothing else reads it before changing that.
+	if (hasCL) then
+
+		project ("Test_OpenCL_kernel_launch_" .. vendor)
+
+		initOpenCL(vendor)
+	
+		language "C++"
+				
+		
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		includedirs {"../../../src"}
+		-- main.cpp is built together with the Bullet3 source files listed here.
+		files {
+			"main.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.cpp",
+			"../../../src/Bullet3Common/b3AlignedAllocator.cpp",
+			"../../../src/Bullet3OpenCL/Initialize/b3OpenCLUtils.h",
+			"../../../src/Bullet3Common/b3Logging.cpp",
+		}
+		
+	end
+end
+createProject("clew")	
+createProject("Apple")
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")