Code-style consistency improvement:

Apply clang-format-all.sh, using the _clang-format file, to all the .cpp/.h files.
Make sure not to apply it to certain serialization structures, since some parsers expect the * to be part of the name rather than the type.
This commit contains no other changes aside from adding and applying clang-format-all.sh.
This commit is contained in:
erwincoumans
2018-09-23 14:17:31 -07:00
parent b73b05e9fb
commit ab8f16961e
1773 changed files with 1081087 additions and 474249 deletions

View File

@@ -12,33 +12,31 @@
class b3GpuBroadphaseInterface
{
public:
typedef class b3GpuBroadphaseInterface* (CreateFunc)(cl_context ctx,cl_device_id device, cl_command_queue q);
typedef class b3GpuBroadphaseInterface*(CreateFunc)(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuBroadphaseInterface()
{
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)=0;
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask) = 0;
virtual void calculateOverlappingPairs(int maxPairs)=0;
virtual void calculateOverlappingPairsHost(int maxPairs)=0;
virtual void calculateOverlappingPairs(int maxPairs) = 0;
virtual void calculateOverlappingPairsHost(int maxPairs) = 0;
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu()=0;
virtual void writeAabbsToGpu() = 0;
virtual cl_mem getAabbBufferWS()=0;
virtual int getNumOverlap()=0;
virtual cl_mem getOverlappingPairBuffer()=0;
virtual cl_mem getAabbBufferWS() = 0;
virtual int getNumOverlap() = 0;
virtual cl_mem getOverlappingPairBuffer() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() = 0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() = 0;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()=0;
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()=0;
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() = 0;
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() = 0;
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() = 0;
};
#endif //B3_GPU_BROADPHASE_INTERFACE_H
#endif //B3_GPU_BROADPHASE_INTERFACE_H

View File

@@ -5,12 +5,9 @@
#include "kernels/sapKernels.h"
//#include "kernels/gridBroadphase.cl"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
#define B3_GRID_BROADPHASE_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphase.cl"
@@ -21,31 +18,25 @@ cl_kernel kFindOverlappingPairs;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sap2Kernel;
//int maxPairsPerBody = 64;
int maxBodiesPerCell = 256;//??
int maxBodiesPerCell = 256; //??
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q )
:m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx,q),
m_smallAabbsMappingGPU(ctx,q),
m_largeAabbsMappingGPU(ctx,q),
m_gpuPairs(ctx,q),
b3GpuGridBroadphase::b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q)
: m_context(ctx),
m_device(device),
m_queue(q),
m_allAabbsGPU1(ctx, q),
m_smallAabbsMappingGPU(ctx, q),
m_largeAabbsMappingGPU(ctx, q),
m_gpuPairs(ctx, q),
m_hashGpu(ctx,q),
m_hashGpu(ctx, q),
m_cellStartGpu(ctx,q),
m_paramsGPU(ctx,q)
m_cellStartGpu(ctx, q),
m_paramsGPU(ctx, q)
{
b3Vector3 gridSize = b3MakeVector3(3,3,3);
b3Vector3 invGridSize = b3MakeVector3(1.f/gridSize[0],1.f/gridSize[1],1.f/gridSize[2]);
b3Vector3 gridSize = b3MakeVector3(3, 3, 3);
b3Vector3 invGridSize = b3MakeVector3(1.f / gridSize[0], 1.f / gridSize[1], 1.f / gridSize[2]);
m_paramsCPU.m_gridSize[0] = 128;
m_paramsCPU.m_gridSize[1] = 128;
@@ -58,92 +49,79 @@ m_paramsGPU(ctx,q)
m_paramsCPU.m_invCellSize[3] = 0.f;
m_paramsGPU.push_back(m_paramsCPU);
cl_int errNum=0;
cl_int errNum = 0;
{
const char* sapSrc = sapCL;
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
b3Assert(errNum==CL_SUCCESS);
m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
b3Assert(errNum==CL_SUCCESS);
cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
b3Assert(errNum == CL_SUCCESS);
m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
b3Assert(errNum == CL_SUCCESS);
}
{
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,gridBroadphaseCL,&errNum,"",B3_GRID_BROADPHASE_PATH);
b3Assert(errNum==CL_SUCCESS);
cl_program gridProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, gridBroadphaseCL, &errNum, "", B3_GRID_BROADPHASE_PATH);
b3Assert(errNum == CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kCalcHashAABB",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kClearCellStart",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kCalcHashAABB = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kCalcHashAABB", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindCellStart",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kClearCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kClearCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,gridBroadphaseCL, "kFindOverlappingPairs",&errNum,gridProg);
b3Assert(errNum==CL_SUCCESS);
kFindCellStart = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindCellStart", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
kFindOverlappingPairs = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, gridBroadphaseCL, "kFindOverlappingPairs", &errNum, gridProg);
b3Assert(errNum == CL_SUCCESS);
}
m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
}
b3GpuGridBroadphase::~b3GpuGridBroadphase()
{
clReleaseKernel( kCalcHashAABB);
clReleaseKernel( kClearCellStart);
clReleaseKernel( kFindCellStart);
clReleaseKernel( kFindOverlappingPairs);
clReleaseKernel( m_sap2Kernel);
clReleaseKernel( m_copyAabbsKernel);
clReleaseKernel(kCalcHashAABB);
clReleaseKernel(kClearCellStart);
clReleaseKernel(kFindCellStart);
clReleaseKernel(kFindOverlappingPairs);
clReleaseKernel(m_sap2Kernel);
clReleaseKernel(m_copyAabbsKernel);
delete m_sorter;
}
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
void b3GpuGridBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_smallAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask)
void b3GpuGridBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size();//NOT userPtr;
aabb.m_signedMaxIndices[3] = m_allAabbsCPU1.size(); //NOT userPtr;
m_largeAabbsMappingCPU.push_back(m_allAabbsCPU1.size());
m_allAabbsCPU1.push_back(aabb);
}
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
{
B3_PROFILE("b3GpuGridBroadphase::calculateOverlappingPairs");
if (0)
{
calculateOverlappingPairsHost(maxPairs);
/*
/*
b3AlignedObjectArray<b3Int4> cpuPairs;
m_gpuPairs.copyToHost(cpuPairs);
printf("host m_gpuPairs.size()=%d\n",m_gpuPairs.size());
@@ -154,57 +132,50 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
*/
return;
}
int numSmallAabbs = m_smallAabbsMappingGPU.size();
b3OpenCLArray<int> pairCount(m_context,m_queue);
b3OpenCLArray<int> pairCount(m_context, m_queue);
pairCount.push_back(0);
m_gpuPairs.resize(maxPairs);//numSmallAabbs*maxPairsPerBody);
m_gpuPairs.resize(maxPairs); //numSmallAabbs*maxPairsPerBody);
{
int numLargeAabbs = m_largeAabbsMappingGPU.size();
if (numLargeAabbs && numSmallAabbs)
{
B3_PROFILE("sap2Kernel");
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL( m_allAabbsGPU1.getBufferCL() ),
b3BufferInfoCL( m_largeAabbsMappingGPU.getBufferCL() ),
b3BufferInfoCL( m_smallAabbsMappingGPU.getBufferCL() ),
b3BufferInfoCL( m_gpuPairs.getBufferCL() ),
b3BufferInfoCL bInfo[] = {
b3BufferInfoCL(m_allAabbsGPU1.getBufferCL()),
b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
b3BufferInfoCL(m_gpuPairs.getBufferCL()),
b3BufferInfoCL(pairCount.getBufferCL())};
b3LauncherCL launcher(m_queue, m_sap2Kernel,"m_sap2Kernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( numLargeAabbs );
launcher.setConst( numSmallAabbs);
launcher.setConst( 0 );//axis is not used
launcher.setConst( maxPairs );
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.setConst(numSmallAabbs);
launcher.setConst(0); //axis is not used
launcher.setConst(maxPairs);
//@todo: use actual maximum work item sizes of the device instead of hardcoded values
launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
int numPairs = pairCount.at(0);
if (numPairs >maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs =maxPairs;
numPairs = maxPairs;
}
}
}
if (numSmallAabbs)
{
B3_PROFILE("gridKernel");
m_hashGpu.resize(numSmallAabbs);
{
B3_PROFILE("kCalcHashAABB");
b3LauncherCL launch(m_queue,kCalcHashAABB,"kCalcHashAABB");
b3LauncherCL launch(m_queue, kCalcHashAABB, "kCalcHashAABB");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
@@ -214,117 +185,104 @@ void b3GpuGridBroadphase::calculateOverlappingPairs(int maxPairs)
}
m_sorter->execute(m_hashGpu);
int numCells = this->m_paramsCPU.m_gridSize[0]*this->m_paramsCPU.m_gridSize[1]*this->m_paramsCPU.m_gridSize[2];
int numCells = this->m_paramsCPU.m_gridSize[0] * this->m_paramsCPU.m_gridSize[1] * this->m_paramsCPU.m_gridSize[2];
m_cellStartGpu.resize(numCells);
//b3AlignedObjectArray<int > cellStartCpu;
{
B3_PROFILE("kClearCellStart");
b3LauncherCL launch(m_queue,kClearCellStart,"kClearCellStart");
b3LauncherCL launch(m_queue, kClearCellStart, "kClearCellStart");
launch.setConst(numCells);
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numCells);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
B3_PROFILE("kFindCellStart");
b3LauncherCL launch(m_queue,kFindCellStart,"kFindCellStart");
b3LauncherCL launch(m_queue, kFindCellStart, "kFindCellStart");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.launch1D(numSmallAabbs);
//m_cellStartGpu.copyToHost(cellStartCpu);
//printf("??\n");
}
{
B3_PROFILE("kFindOverlappingPairs");
b3LauncherCL launch(m_queue,kFindOverlappingPairs,"kFindOverlappingPairs");
b3LauncherCL launch(m_queue, kFindOverlappingPairs, "kFindOverlappingPairs");
launch.setConst(numSmallAabbs);
launch.setBuffer(m_allAabbsGPU1.getBufferCL());
launch.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
launch.setBuffer(m_hashGpu.getBufferCL());
launch.setBuffer(m_cellStartGpu.getBufferCL());
launch.setBuffer(m_paramsGPU.getBufferCL());
//launch.setBuffer(0);
launch.setBuffer(pairCount.getBufferCL());
launch.setBuffer(m_gpuPairs.getBufferCL());
launch.setConst(maxPairs);
launch.launch1D(numSmallAabbs);
int numPairs = pairCount.at(0);
if (numPairs >maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs =maxPairs;
numPairs = maxPairs;
}
m_gpuPairs.resize(numPairs);
if (0)
{
b3AlignedObjectArray<b3Int4> pairsCpu;
m_gpuPairs.copyToHost(pairsCpu);
int sz = m_gpuPairs.size();
printf("m_gpuPairs.size()=%d\n",sz);
for (int i=0;i<m_gpuPairs.size();i++)
printf("m_gpuPairs.size()=%d\n", sz);
for (int i = 0; i < m_gpuPairs.size(); i++)
{
printf("pair %d = %d,%d\n",i,pairsCpu[i].x,pairsCpu[i].y);
printf("pair %d = %d,%d\n", i, pairsCpu[i].x, pairsCpu[i].y);
}
printf("?!?\n");
}
}
}
//calculateOverlappingPairsHost(maxPairs);
}
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
m_hostPairs.resize(0);
m_allAabbsGPU1.copyToHost(m_allAabbsCPU1);
for (int i=0;i<m_allAabbsCPU1.size();i++)
for (int i = 0; i < m_allAabbsCPU1.size(); i++)
{
for (int j=i+1;j<m_allAabbsCPU1.size();j++)
for (int j = i + 1; j < m_allAabbsCPU1.size(); j++)
{
if (b3TestAabbAgainstAabb2(m_allAabbsCPU1[i].m_minVec, m_allAabbsCPU1[i].m_maxVec,
m_allAabbsCPU1[j].m_minVec,m_allAabbsCPU1[j].m_maxVec))
m_allAabbsCPU1[j].m_minVec, m_allAabbsCPU1[j].m_maxVec))
{
b3Int4 pair;
int a = m_allAabbsCPU1[j].m_minIndices[3];
int b = m_allAabbsCPU1[i].m_minIndices[3];
if (a<=b)
if (a <= b)
{
pair.x = a;
pair.y = b;//store the original index in the unsorted aabb array
} else
pair.x = a;
pair.y = b; //store the original index in the unsorted aabb array
}
else
{
pair.x = b;
pair.y = a;//store the original index in the unsorted aabb array
pair.y = a; //store the original index in the unsorted aabb array
}
if (m_hostPairs.size()<maxPairs)
if (m_hostPairs.size() < maxPairs)
{
m_hostPairs.push_back(pair);
}
@@ -332,40 +290,36 @@ void b3GpuGridBroadphase::calculateOverlappingPairsHost(int maxPairs)
}
}
m_gpuPairs.copyFromHost(m_hostPairs);
}
//call writeAabbsToGpu after done making all changes (createProxy etc)
//call writeAabbsToGpu after done making all changes (createProxy etc)
void b3GpuGridBroadphase::writeAabbsToGpu()
{
m_allAabbsGPU1.copyFromHost(m_allAabbsCPU1);
m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
}
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
cl_mem b3GpuGridBroadphase::getAabbBufferWS()
{
return this->m_allAabbsGPU1.getBufferCL();
}
int b3GpuGridBroadphase::getNumOverlap()
int b3GpuGridBroadphase::getNumOverlap()
{
return m_gpuPairs.size();
}
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
cl_mem b3GpuGridBroadphase::getOverlappingPairBuffer()
{
return m_gpuPairs.getBufferCL();
}
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
b3OpenCLArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsGPU()
{
return m_allAabbsGPU1;
}
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
b3AlignedObjectArray<b3SapAabb>& b3GpuGridBroadphase::getAllAabbsCPU()
{
return m_allAabbsCPU1;
}
@@ -382,4 +336,3 @@ b3OpenCLArray<int>& b3GpuGridBroadphase::getLargeAabbIndicesGPU()
{
return m_largeAabbsMappingGPU;
}

View File

@@ -6,83 +6,75 @@
struct b3ParamsGridBroadphaseCL
{
float m_invCellSize[4];
int m_gridSize[4];
int m_gridSize[4];
int getMaxBodiesPerCell() const
int getMaxBodiesPerCell() const
{
return m_gridSize[3];
}
void setMaxBodiesPerCell(int maxOverlap)
void setMaxBodiesPerCell(int maxOverlap)
{
m_gridSize[3] = maxOverlap;
}
};
class b3GpuGridBroadphase : public b3GpuBroadphaseInterface
{
protected:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU1;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU1;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3AlignedObjectArray<b3Int4> m_hostPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
b3OpenCLArray<b3Int4> m_gpuPairs;
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
b3OpenCLArray<b3SortData> m_hashGpu;
b3OpenCLArray<int> m_cellStartGpu;
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
b3ParamsGridBroadphaseCL m_paramsCPU;
b3OpenCLArray<b3ParamsGridBroadphaseCL> m_paramsGPU;
class b3RadixSort32CL* m_sorter;
class b3RadixSort32CL* m_sorter;
public:
b3GpuGridBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
b3GpuGridBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuGridBroadphase();
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFunc(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuGridBroadphase(ctx,device,q);
return new b3GpuGridBroadphase(ctx, device, q);
}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU();
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_GRID_BROADPHASE_H
#endif //B3_GPU_GRID_BROADPHASE_H

View File

@@ -16,177 +16,174 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvh.h"
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) :
m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
b3GpuParallelLinearBvh::b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue) : m_queue(queue),
m_radixSorter(context, device, queue),
m_rootNodeIndex(context, queue),
m_maxDistanceFromRoot(context, queue),
m_temp(context, queue),
m_internalNodeAabbs(context, queue),
m_internalNodeLeafIndexRanges(context, queue),
m_internalNodeChildNodes(context, queue),
m_internalNodeParentNodes(context, queue),
m_commonPrefixes(context, queue),
m_commonPrefixLengths(context, queue),
m_distanceFromRoot(context, queue),
m_leafNodeParentNodes(context, queue),
m_mortonCodesAndAabbIndicies(context, queue),
m_mergedAabb(context, queue),
m_leafNodeAabbs(context, queue),
m_largeAabbs(context, queue)
{
m_rootNodeIndex.resize(1);
m_maxDistanceFromRoot.resize(1);
m_temp.resize(1);
//
const char CL_PROGRAM_PATH[] = "src/Bullet3OpenCL/BroadphaseCollision/kernels/parallelLinearBvh.cl";
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
const char* kernelSource = parallelLinearBvhCL; //parallelLinearBvhCL.h
cl_int error;
char* additionalMacros = 0;
m_parallelLinearBvhProgram = b3OpenCLUtils::compileCLProgramFromString(context, device, kernelSource, &error, additionalMacros, CL_PROGRAM_PATH);
b3Assert(m_parallelLinearBvhProgram);
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros );
m_separateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "separateAabbs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_separateAabbsKernel);
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findAllNodesMergedAabbKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findAllNodesMergedAabb", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findAllNodesMergedAabbKernel);
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros );
m_assignMortonCodesAndAabbIndiciesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "assignMortonCodesAndAabbIndicies", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_assignMortonCodesAndAabbIndiciesKernel);
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros );
m_computeAdjacentPairCommonPrefixKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "computeAdjacentPairCommonPrefix", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_computeAdjacentPairCommonPrefixKernel);
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeLeafNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeLeafNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeLeafNodesKernel);
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeInternalNodesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeInternalNodes", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeInternalNodesKernel);
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findDistanceFromRootKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findDistanceFromRoot", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findDistanceFromRootKernel);
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros );
m_buildBinaryRadixTreeAabbsRecursiveKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "buildBinaryRadixTreeAabbsRecursive", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_buildBinaryRadixTreeAabbsRecursiveKernel);
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros );
m_findLeafIndexRangesKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "findLeafIndexRanges", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_findLeafIndexRangesKernel);
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhCalculateOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhCalculateOverlappingPairs", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhCalculateOverlappingPairsKernel);
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhRayTraverseKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhRayTraverse", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhRayTraverseKernel);
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhLargeAabbAabbTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbAabbTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbAabbTestKernel);
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString( context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros );
m_plbvhLargeAabbRayTestKernel = b3OpenCLUtils::compileCLKernelFromString(context, device, kernelSource, "plbvhLargeAabbRayTest", &error, m_parallelLinearBvhProgram, additionalMacros);
b3Assert(m_plbvhLargeAabbRayTestKernel);
}
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
b3GpuParallelLinearBvh::~b3GpuParallelLinearBvh()
{
clReleaseKernel(m_separateAabbsKernel);
clReleaseKernel(m_findAllNodesMergedAabbKernel);
clReleaseKernel(m_assignMortonCodesAndAabbIndiciesKernel);
clReleaseKernel(m_computeAdjacentPairCommonPrefixKernel);
clReleaseKernel(m_buildBinaryRadixTreeLeafNodesKernel);
clReleaseKernel(m_buildBinaryRadixTreeInternalNodesKernel);
clReleaseKernel(m_findDistanceFromRootKernel);
clReleaseKernel(m_buildBinaryRadixTreeAabbsRecursiveKernel);
clReleaseKernel(m_findLeafIndexRangesKernel);
clReleaseKernel(m_plbvhCalculateOverlappingPairsKernel);
clReleaseKernel(m_plbvhRayTraverseKernel);
clReleaseKernel(m_plbvhLargeAabbAabbTestKernel);
clReleaseKernel(m_plbvhLargeAabbRayTestKernel);
clReleaseProgram(m_parallelLinearBvhProgram);
}
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices)
{
B3_PROFILE("b3ParallelLinearBvh::build()");
int numLargeAabbs = largeAabbIndices.size();
int numSmallAabbs = smallAabbIndices.size();
//Since all AABBs(both large and small) are input as a contiguous array,
//Since all AABBs(both large and small) are input as a contiguous array,
//with 2 additional arrays used to indicate the indices of large and small AABBs,
//it is necessary to separate the AABBs so that the large AABBs will not degrade the quality of the BVH.
{
B3_PROFILE("Separate large and small AABBs");
m_largeAabbs.resize(numLargeAabbs);
m_leafNodeAabbs.resize(numSmallAabbs);
//Write large AABBs into m_largeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( largeAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(largeAabbIndices.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbs);
launcher.launch1D(numLargeAabbs);
}
//Write small AABBs into m_leafNodeAabbs
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( worldSpaceAabbs.getBufferCL() ),
b3BufferInfoCL( smallAabbIndices.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(worldSpaceAabbs.getBufferCL()),
b3BufferInfoCL(smallAabbIndices.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_separateAabbsKernel, "m_separateAabbsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numSmallAabbs);
launcher.launch1D(numSmallAabbs);
}
clFinish(m_queue);
}
//
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numLeaves = numSmallAabbs; //Number of leaves in the BVH == Number of rigid bodies with small AABBs
int numInternalNodes = numLeaves - 1;
if(numLeaves < 2)
if (numLeaves < 2)
{
//Number of leaf nodes is checked in calculateOverlappingPairs() and testRaysAgainstBvhAabbs(),
//so it does not matter if numLeaves == 0 and rootNodeIndex == -1
int rootNodeIndex = numLeaves - 1;
m_rootNodeIndex.copyFromHostPointer(&rootNodeIndex, 1);
//Since the AABBs need to be rearranged(sorted) for the BVH construction algorithm,
//m_mortonCodesAndAabbIndicies.m_value is used to map a sorted AABB index to the unsorted AABB index
//instead of directly moving the AABBs. It needs to be set for the ray cast traversal kernel to work.
//( m_mortonCodesAndAabbIndicies[].m_value == unsorted index == index of m_leafNodeAabbs )
if(numLeaves == 1)
if (numLeaves == 1)
{
b3SortData leaf;
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
leaf.m_value = 0; //1 leaf so index is always 0; leaf.m_key does not need to be set
m_mortonCodesAndAabbIndicies.resize(1);
m_mortonCodesAndAabbIndicies.copyFromHostPointer(&leaf, 1);
}
return;
}
//
{
m_internalNodeAabbs.resize(numInternalNodes);
@@ -197,37 +194,37 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
m_commonPrefixes.resize(numInternalNodes);
m_commonPrefixLengths.resize(numInternalNodes);
m_distanceFromRoot.resize(numInternalNodes);
m_leafNodeParentNodes.resize(numLeaves);
m_mortonCodesAndAabbIndicies.resize(numLeaves);
m_mergedAabb.resize(numLeaves);
}
//Find the merged AABB of all small AABBs; this is used to define the size of
//Find the merged AABB of all small AABBs; this is used to define the size of
//each cell in the virtual grid for the next kernel(2^10 cells in each dimension).
{
B3_PROFILE("Find AABB of merged nodes");
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for(int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
m_mergedAabb.copyFromOpenCLArray(m_leafNodeAabbs); //Need to make a copy since the kernel modifies the array
for (int numAabbsNeedingMerge = numLeaves; numAabbsNeedingMerge >= 2;
numAabbsNeedingMerge = numAabbsNeedingMerge / 2 + numAabbsNeedingMerge % 2)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mergedAabb.getBufferCL() ) //Resulting AABB is stored in m_mergedAabb[0]
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mergedAabb.getBufferCL()) //Resulting AABB is stored in m_mergedAabb[0]
};
b3LauncherCL launcher(m_queue, m_findAllNodesMergedAabbKernel, "m_findAllNodesMergedAabbKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numAabbsNeedingMerge);
launcher.launch1D(numAabbsNeedingMerge);
}
clFinish(m_queue);
}
//Insert the center of the AABBs into a virtual grid,
//then convert the discrete grid coordinates into a morton code
//For each element in m_mortonCodesAndAabbIndicies, set
@@ -235,34 +232,32 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
// m_value == small AABB index
{
B3_PROFILE("Assign morton codes");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_mergedAabb.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_mergedAabb.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL())};
b3LauncherCL launcher(m_queue, m_assignMortonCodesAndAabbIndiciesKernel, "m_assignMortonCodesAndAabbIndiciesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//
{
B3_PROFILE("Sort leaves by morton codes");
m_radixSorter.execute(m_mortonCodesAndAabbIndicies);
clFinish(m_queue);
}
//
constructBinaryRadixTree();
//Since it is a sorted binary radix tree, each internal node contains a contiguous subset of leaf node indices.
//The root node contains leaf node indices in the range [0, numLeafNodes - 1].
//The child nodes of each node split their parent's index range into 2 contiguous halves.
@@ -273,17 +268,16 @@ void b3GpuParallelLinearBvh::build(const b3OpenCLArray<b3SapAabb>& worldSpaceAab
//This property can be used for optimizing calculateOverlappingPairs(), to avoid testing each AABB pair twice
{
B3_PROFILE("m_findLeafIndexRangesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findLeafIndexRangesKernel, "m_findLeafIndexRangesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
@@ -293,285 +287,271 @@ void b3GpuParallelLinearBvh::calculateOverlappingPairs(b3OpenCLArray<b3Int4>& ou
{
int maxPairs = out_overlappingPairs.size();
b3OpenCLArray<int>& numPairsGpu = m_temp;
int reset = 0;
numPairsGpu.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 1 )
if (m_leafNodeAabbs.size() > 1)
{
B3_PROFILE("PLBVH small-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhCalculateOverlappingPairsKernel, "m_plbvhCalculateOverlappingPairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if( numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0 )
if (numLargeAabbRigids > 0 && m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH large-small AABB test");
int numQueryAabbs = m_leafNodeAabbs.size();
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( numPairsGpu.getBufferCL() ),
b3BufferInfoCL( out_overlappingPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(numPairsGpu.getBufferCL()),
b3BufferInfoCL(out_overlappingPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbAabbTestKernel, "m_plbvhLargeAabbAabbTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxPairs);
launcher.setConst(numLargeAabbRigids);
launcher.setConst(numQueryAabbs);
launcher.launch1D(numQueryAabbs);
clFinish(m_queue);
}
//
int numPairs = -1;
numPairsGpu.copyToHostPointer(&numPairs, 1);
if(numPairs > maxPairs)
if (numPairs > maxPairs)
{
b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
numPairs = maxPairs;
numPairsGpu.copyFromHostPointer(&maxPairs, 1);
}
out_overlappingPairs.resize(numPairs);
}
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
void b3GpuParallelLinearBvh::testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs)
{
B3_PROFILE("PLBVH testRaysAgainstBvhAabbs()");
int numRays = rays.size();
int maxRayRigidPairs = out_rayRigidPairs.size();
int reset = 0;
out_numRayRigidPairs.copyFromHostPointer(&reset, 1);
//
if( m_leafNodeAabbs.size() > 0 )
if (m_leafNodeAabbs.size() > 0)
{
B3_PROFILE("PLBVH ray test small AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeLeafIndexRanges.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeLeafIndexRanges.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhRayTraverseKernel, "m_plbvhRayTraverseKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
int numLargeAabbRigids = m_largeAabbs.size();
if(numLargeAabbRigids > 0)
if (numLargeAabbRigids > 0)
{
B3_PROFILE("PLBVH ray test large AABB");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_largeAabbs.getBufferCL() ),
b3BufferInfoCL( rays.getBufferCL() ),
b3BufferInfoCL( out_numRayRigidPairs.getBufferCL() ),
b3BufferInfoCL( out_rayRigidPairs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_largeAabbs.getBufferCL()),
b3BufferInfoCL(rays.getBufferCL()),
b3BufferInfoCL(out_numRayRigidPairs.getBufferCL()),
b3BufferInfoCL(out_rayRigidPairs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_plbvhLargeAabbRayTestKernel, "m_plbvhLargeAabbRayTestKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLargeAabbRigids);
launcher.setConst(maxRayRigidPairs);
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_queue);
}
//
int numRayRigidPairs = -1;
out_numRayRigidPairs.copyToHostPointer(&numRayRigidPairs, 1);
if(numRayRigidPairs > maxRayRigidPairs)
if (numRayRigidPairs > maxRayRigidPairs)
b3Error("Error running out of rayRigid pairs: numRayRigidPairs = %d, maxRayRigidPairs = %d.\n", numRayRigidPairs, maxRayRigidPairs);
}
void b3GpuParallelLinearBvh::constructBinaryRadixTree()
{
B3_PROFILE("b3GpuParallelLinearBvh::constructBinaryRadixTree()");
int numLeaves = m_leafNodeAabbs.size();
int numInternalNodes = numLeaves - 1;
//Each internal node is placed in between 2 leaf nodes.
//By using this arrangement and computing the common prefix between
//these 2 adjacent leaf nodes, it is possible to quickly construct a binary radix tree.
{
B3_PROFILE("m_computeAdjacentPairCommonPrefixKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL())};
b3LauncherCL launcher(m_queue, m_computeAdjacentPairCommonPrefixKernel, "m_computeAdjacentPairCommonPrefixKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//For each leaf node, select its parent node by
//For each leaf node, select its parent node by
//comparing the 2 nearest internal nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeLeafNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_leafNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
b3BufferInfoCL(m_leafNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeLeafNodesKernel, "m_buildBinaryRadixTreeLeafNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numLeaves);
launcher.launch1D(numLeaves);
clFinish(m_queue);
}
//For each internal node, perform 2 binary searches among the other internal nodes
//to its left and right to find its potential parent nodes and assign child node indices
{
B3_PROFILE("m_buildBinaryRadixTreeInternalNodesKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_commonPrefixes.getBufferCL() ),
b3BufferInfoCL( m_commonPrefixLengths.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_commonPrefixes.getBufferCL()),
b3BufferInfoCL(m_commonPrefixLengths.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_rootNodeIndex.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeInternalNodesKernel, "m_buildBinaryRadixTreeInternalNodesKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Find the number of nodes seperating each internal node and the root node
//so that the AABBs can be set using the next kernel.
//Also determine the maximum number of nodes separating an internal node and the root node.
{
B3_PROFILE("m_findDistanceFromRootKernel");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_rootNodeIndex.getBufferCL() ),
b3BufferInfoCL( m_internalNodeParentNodes.getBufferCL() ),
b3BufferInfoCL( m_maxDistanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_rootNodeIndex.getBufferCL()),
b3BufferInfoCL(m_internalNodeParentNodes.getBufferCL()),
b3BufferInfoCL(m_maxDistanceFromRoot.getBufferCL()),
b3BufferInfoCL(m_distanceFromRoot.getBufferCL())};
b3LauncherCL launcher(m_queue, m_findDistanceFromRootKernel, "m_findDistanceFromRootKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numInternalNodes);
launcher.launch1D(numInternalNodes);
clFinish(m_queue);
}
//Starting from the internal nodes nearest to the leaf nodes, recursively move up
//the tree towards the root to set the AABBs of each internal node; each internal node
//checks its children and merges their AABBs
{
B3_PROFILE("m_buildBinaryRadixTreeAabbsRecursiveKernel");
int maxDistanceFromRoot = -1;
{
B3_PROFILE("copy maxDistanceFromRoot to CPU");
m_maxDistanceFromRoot.copyToHostPointer(&maxDistanceFromRoot, 1);
clFinish(m_queue);
}
for(int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
for (int distanceFromRoot = maxDistanceFromRoot; distanceFromRoot >= 0; --distanceFromRoot)
{
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_distanceFromRoot.getBufferCL() ),
b3BufferInfoCL( m_mortonCodesAndAabbIndicies.getBufferCL() ),
b3BufferInfoCL( m_internalNodeChildNodes.getBufferCL() ),
b3BufferInfoCL( m_leafNodeAabbs.getBufferCL() ),
b3BufferInfoCL( m_internalNodeAabbs.getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_distanceFromRoot.getBufferCL()),
b3BufferInfoCL(m_mortonCodesAndAabbIndicies.getBufferCL()),
b3BufferInfoCL(m_internalNodeChildNodes.getBufferCL()),
b3BufferInfoCL(m_leafNodeAabbs.getBufferCL()),
b3BufferInfoCL(m_internalNodeAabbs.getBufferCL())};
b3LauncherCL launcher(m_queue, m_buildBinaryRadixTreeAabbsRecursiveKernel, "m_buildBinaryRadixTreeAabbsRecursiveKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(maxDistanceFromRoot);
launcher.setConst(distanceFromRoot);
launcher.setConst(numInternalNodes);
//It may seem inefficent to launch a thread for each internal node when a
//much smaller number of nodes is actually processed, but this is actually
//faster than determining the exact nodes that are ready to merge their child AABBs.
//faster than determining the exact nodes that are ready to merge their child AABBs.
launcher.launch1D(numInternalNodes);
}
clFinish(m_queue);
}
}

View File

@@ -37,10 +37,10 @@ subject to the following restrictions:
///"Maximizing Parallelism in the Construction of BVHs, Octrees, and k-d trees" [Karras 2012] \n
///@par
///The basic algorithm for building the BVH as presented in [Lauterbach et al. 2009] consists of 4 stages:
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Assign morton codes for each AABB using its center (after quantizing the AABB centers into a virtual grid)
/// - [fully parallel] Sort morton codes
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
/// - [somewhat parallel] Build binary radix tree (assign parent/child pointers for internal nodes of the BVH)
/// - [somewhat parallel] Set internal node AABBs
///@par
///[Karras 2012] improves on the algorithm by introducing fully parallel methods for the last 2 stages.
///The BVH implementation here shares many concepts with [Karras 2012], but a different method is used for constructing the tree.
@@ -49,75 +49,75 @@ subject to the following restrictions:
class b3GpuParallelLinearBvh
{
cl_command_queue m_queue;
cl_program m_parallelLinearBvhProgram;
cl_kernel m_separateAabbsKernel;
cl_kernel m_findAllNodesMergedAabbKernel;
cl_kernel m_assignMortonCodesAndAabbIndiciesKernel;
//Binary radix tree construction kernels
cl_kernel m_computeAdjacentPairCommonPrefixKernel;
cl_kernel m_buildBinaryRadixTreeLeafNodesKernel;
cl_kernel m_buildBinaryRadixTreeInternalNodesKernel;
cl_kernel m_findDistanceFromRootKernel;
cl_kernel m_buildBinaryRadixTreeAabbsRecursiveKernel;
cl_kernel m_findLeafIndexRangesKernel;
//Traversal kernels
cl_kernel m_plbvhCalculateOverlappingPairsKernel;
cl_kernel m_plbvhRayTraverseKernel;
cl_kernel m_plbvhLargeAabbAabbTestKernel;
cl_kernel m_plbvhLargeAabbRayTestKernel;
b3RadixSort32CL m_radixSorter;
//1 element
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
b3OpenCLArray<int> m_rootNodeIndex; //Most significant bit(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_maxDistanceFromRoot; //Max number of internal nodes between an internal node and the root node
b3OpenCLArray<int> m_temp; //Used to hold the number of pairs in calculateOverlappingPairs()
//1 element per internal node (number_of_internal_nodes == number_of_leaves - 1)
b3OpenCLArray<b3SapAabb> m_internalNodeAabbs;
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3Int2> m_internalNodeLeafIndexRanges; //x == min leaf index, y == max leaf index
b3OpenCLArray<b3Int2> m_internalNodeChildNodes; //x == left child, y == right child; msb(0x80000000) is set to indicate internal node
b3OpenCLArray<int> m_internalNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
//1 element per internal node; for binary radix tree construction
b3OpenCLArray<b3Int64> m_commonPrefixes;
b3OpenCLArray<int> m_commonPrefixLengths;
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
b3OpenCLArray<int> m_distanceFromRoot; //Number of internal nodes between this node and the root
//1 element per leaf node (leaf nodes only include small AABBs)
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
b3OpenCLArray<int> m_leafNodeParentNodes; //For parent node index, msb(0x80000000) is not set since it is always internal
b3OpenCLArray<b3SortData> m_mortonCodesAndAabbIndicies; //m_key == morton code, m_value == aabb index in m_leafNodeAabbs
b3OpenCLArray<b3SapAabb> m_mergedAabb; //m_mergedAabb[0] contains the merged AABB of all leaf nodes
b3OpenCLArray<b3SapAabb> m_leafNodeAabbs; //Contains only small AABBs
//1 element per large AABB, which is not stored in the BVH
b3OpenCLArray<b3SapAabb> m_largeAabbs;
public:
b3GpuParallelLinearBvh(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvh();
///Must be called before any other function
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
void build(const b3OpenCLArray<b3SapAabb>& worldSpaceAabbs, const b3OpenCLArray<int>& smallAabbIndices,
const b3OpenCLArray<int>& largeAabbIndices);
///calculateOverlappingPairs() uses the worldSpaceAabbs parameter of b3GpuParallelLinearBvh::build() as the query AABBs.
///@param out_overlappingPairs The size() of this array is used to determine the max number of pairs.
///If the number of overlapping pairs is < out_overlappingPairs.size(), out_overlappingPairs is resized.
void calculateOverlappingPairs(b3OpenCLArray<b3Int4>& out_overlappingPairs);
///@param out_numRigidRayPairs Array of length 1; contains the number of detected ray-rigid AABB intersections;
///this value may be greater than out_rayRigidPairs.size() if out_rayRigidPairs is not large enough.
///@param out_rayRigidPairs Contains an array of rays intersecting rigid AABBs; x == ray index, y == rigid body index.
///If the size of this array is insufficient to hold all ray-rigid AABB intersections, additional intersections are discarded.
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
void testRaysAgainstBvhAabbs(const b3OpenCLArray<b3RayInfo>& rays,
b3OpenCLArray<int>& out_numRayRigidPairs, b3OpenCLArray<b3Int2>& out_rayRigidPairs);
private:
void constructBinaryRadixTree();
};

View File

@@ -13,45 +13,44 @@ subject to the following restrictions:
#include "b3GpuParallelLinearBvhBroadphase.h"
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) :
m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
b3GpuParallelLinearBvhBroadphase::b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue) : m_plbvh(context, device, queue),
m_overlappingPairsGpu(context, queue),
m_aabbsGpu(context, queue),
m_smallAabbsMappingGpu(context, queue),
m_largeAabbsMappingGpu(context, queue)
{
}
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
void b3GpuParallelLinearBvhBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_smallAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
void b3GpuParallelLinearBvhBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
{
int newAabbIndex = m_aabbsCpu.size();
b3SapAabb aabb;
aabb.m_minVec = aabbMin;
aabb.m_maxVec = aabbMax;
aabb.m_minIndices[3] = userPtr;
aabb.m_signedMaxIndices[3] = newAabbIndex;
m_largeAabbsMappingCpu.push_back(newAabbIndex);
m_aabbsCpu.push_back(aabb);
}
@@ -59,22 +58,19 @@ void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairs(int maxPairs)
{
//Reconstruct BVH
m_plbvh.build(m_aabbsGpu, m_smallAabbsMappingGpu, m_largeAabbsMappingGpu);
//
m_overlappingPairsGpu.resize(maxPairs);
m_plbvh.calculateOverlappingPairs(m_overlappingPairsGpu);
}
void b3GpuParallelLinearBvhBroadphase::calculateOverlappingPairsHost(int maxPairs)
{
b3Assert(0); //CPU version not implemented
b3Assert(0); //CPU version not implemented
}
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
void b3GpuParallelLinearBvhBroadphase::writeAabbsToGpu()
{
m_aabbsGpu.copyFromHost(m_aabbsCpu);
m_smallAabbsMappingGpu.copyFromHost(m_smallAabbsMappingCpu);
m_largeAabbsMappingGpu.copyFromHost(m_largeAabbsMappingCpu);
}

View File

@@ -21,42 +21,42 @@ subject to the following restrictions:
class b3GpuParallelLinearBvhBroadphase : public b3GpuBroadphaseInterface
{
b3GpuParallelLinearBvh m_plbvh;
b3OpenCLArray<b3Int4> m_overlappingPairsGpu;
b3OpenCLArray<b3SapAabb> m_aabbsGpu;
b3OpenCLArray<int> m_smallAabbsMappingGpu;
b3OpenCLArray<int> m_largeAabbsMappingGpu;
b3AlignedObjectArray<b3SapAabb> m_aabbsCpu;
b3AlignedObjectArray<int> m_smallAabbsMappingCpu;
b3AlignedObjectArray<int> m_largeAabbsMappingCpu;
public:
b3GpuParallelLinearBvhBroadphase(cl_context context, cl_device_id device, cl_command_queue queue);
virtual ~b3GpuParallelLinearBvhBroadphase() {}
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual int getNumOverlap() { return m_overlappingPairsGpu.size(); }
virtual cl_mem getOverlappingPairBuffer() { return m_overlappingPairsGpu.getBufferCL(); }
virtual cl_mem getAabbBufferWS() { return m_aabbsGpu.getBufferCL(); }
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU() { return m_aabbsGpu; }
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU() { return m_overlappingPairsGpu; }
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU() { return m_smallAabbsMappingGpu; }
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU() { return m_largeAabbsMappingGpu; }
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU() { return m_aabbsCpu; }
static b3GpuBroadphaseInterface* CreateFunc(cl_context context, cl_device_id device, cl_command_queue queue)
{
return new b3GpuParallelLinearBvhBroadphase(context, device, queue);

File diff suppressed because it is too large Load Diff

View File

@@ -2,7 +2,7 @@
#define B3_GPU_SAP_BROADPHASE_H
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
class b3Vector3;
#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
@@ -11,141 +11,133 @@ class b3Vector3;
#include "b3GpuBroadphaseInterface.h"
class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel ;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_flipFloatKernel;
cl_kernel m_scatterKernel;
cl_kernel m_copyAabbsKernel;
cl_kernel m_sapKernel;
cl_kernel m_sap2Kernel;
cl_kernel m_prepareSumVarianceKernel;
class b3RadixSort32CL* m_sorter;
///test for 3d SAP
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
b3AlignedObjectArray<b3SortData> m_sortedAxisCPU[3][2];
b3AlignedObjectArray<b3UnsignedInt2> m_objectMinMaxIndexCPU[3][2];
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis0prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis1prev;
b3OpenCLArray<b3UnsignedInt2> m_objectMinMaxIndexGPUaxis2prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2;
b3OpenCLArray<b3SortData> m_sortedAxisGPU0prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU1prev;
b3OpenCLArray<b3SortData> m_sortedAxisGPU2prev;
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
b3OpenCLArray<int> m_addedCountGPU;
b3OpenCLArray<int> m_removedCountGPU;
b3OpenCLArray<b3Int4> m_addedHostPairsGPU;
b3OpenCLArray<b3Int4> m_removedHostPairsGPU;
b3OpenCLArray<int> m_addedCountGPU;
b3OpenCLArray<int> m_removedCountGPU;
int m_currentBuffer;
int m_currentBuffer;
public:
b3OpenCLArray<int> m_pairCount;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
{
return m_allAabbsGPU;
}
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
{
return m_allAabbsCPU;
}
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<b3Vector3> m_sum;
b3OpenCLArray<b3Vector3> m_sum2;
b3OpenCLArray<b3Vector3> m_dst;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3OpenCLArray<int> m_smallAabbsMappingGPU;
b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3OpenCLArray<int> m_largeAabbsMappingGPU;
b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
b3OpenCLArray<b3Int4> m_overlappingPairs;
b3OpenCLArray<b3Int4> m_overlappingPairs;
//temporary gpu work memory
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
b3OpenCLArray<b3SortData> m_gpuSmallSortData;
b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
class b3PrefixScanFloat4CL* m_prefixScanFloat4;
enum b3GpuSapKernelType
{
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU=1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU = 1,
B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
B3_GPU_SAP_KERNEL_ORIGINAL,
B3_GPU_SAP_KERNEL_BARRIER,
B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
};
b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType = B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
virtual ~b3GpuSapBroadphase();
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceCpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU);
}
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBruteForceGpu(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU);
}
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncOriginal(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_ORIGINAL);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_ORIGINAL);
}
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncBarrier(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_BARRIER);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_BARRIER);
}
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx,cl_device_id device, cl_command_queue q)
static b3GpuBroadphaseInterface* CreateFuncLocalMemory(cl_context ctx, cl_device_id device, cl_command_queue q)
{
return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
return new b3GpuSapBroadphase(ctx, device, q, B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
}
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
void reset();
virtual void calculateOverlappingPairs(int maxPairs);
virtual void calculateOverlappingPairsHost(int maxPairs);
void reset();
void init3dSap();
virtual void calculateOverlappingPairsHostIncremental3Sap();
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr , int collisionFilterGroup, int collisionFilterMask);
virtual void createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
virtual void createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask);
//call writeAabbsToGpu after done making all changes (createProxy etc)
virtual void writeAabbsToGpu();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual cl_mem getAabbBufferWS();
virtual int getNumOverlap();
virtual cl_mem getOverlappingPairBuffer();
virtual b3OpenCLArray<b3Int4>& getOverlappingPairsGPU();
virtual b3OpenCLArray<int>& getSmallAabbIndicesGPU();
virtual b3OpenCLArray<int>& getLargeAabbIndicesGPU();
};
#endif //B3_GPU_SAP_BROADPHASE_H
#endif //B3_GPU_SAP_BROADPHASE_H

View File

@@ -5,10 +5,9 @@
#include "Bullet3Collision/BroadPhaseCollision/shared/b3Aabb.h"
///just make sure that the b3Aabb is 16-byte aligned
B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb : public b3Aabb
{
B3_ATTRIBUTE_ALIGNED16(struct)
b3SapAabb : public b3Aabb{
};
};
#endif //B3_SAP_AABB_H
#endif //B3_SAP_AABB_H

View File

@@ -1,199 +1,198 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* gridBroadphaseCL= \
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n"
;
static const char* gridBroadphaseCL =
"int getPosHash(int4 gridPos, __global float4* pParams)\n"
"{\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x &= gridDim.x - 1;\n"
" gridPos.y &= gridDim.y - 1;\n"
" gridPos.z &= gridDim.z - 1;\n"
" int hash = gridPos.z * gridDim.y * gridDim.x + gridPos.y * gridDim.x + gridPos.x;\n"
" return hash;\n"
"} \n"
"int4 getGridPos(float4 worldPos, __global float4* pParams)\n"
"{\n"
" int4 gridPos;\n"
" int4 gridDim = *((__global int4*)(pParams + 1));\n"
" gridPos.x = (int)floor(worldPos.x * pParams[0].x) & (gridDim.x - 1);\n"
" gridPos.y = (int)floor(worldPos.y * pParams[0].y) & (gridDim.y - 1);\n"
" gridPos.z = (int)floor(worldPos.z * pParams[0].z) & (gridDim.z - 1);\n"
" return gridPos;\n"
"}\n"
"// calculate grid hash value for each body using its AABB\n"
"__kernel void kCalcHashAABB(int numObjects, __global float4* allpAABB, __global const int* smallAabbMapping, __global int2* pHash, __global float4* pParams )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" float4 bbMin = allpAABB[smallAabbMapping[index]*2];\n"
" float4 bbMax = allpAABB[smallAabbMapping[index]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" pos.w = 0.f;\n"
" // get address in grid\n"
" int4 gridPos = getGridPos(pos, pParams);\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // store grid hash and body index\n"
" int2 hashVal;\n"
" hashVal.x = gridHash;\n"
" hashVal.y = index;\n"
" pHash[index] = hashVal;\n"
"}\n"
"__kernel void kClearCellStart( int numCells, \n"
" __global int* pCellStart )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numCells)\n"
" {\n"
" return;\n"
" }\n"
" pCellStart[index] = -1;\n"
"}\n"
"__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart )\n"
"{\n"
" __local int sharedHash[513];\n"
" int index = get_global_id(0);\n"
" int2 sortedData;\n"
" if(index < numObjects)\n"
" {\n"
" sortedData = pHash[index];\n"
" // Load hash data into shared memory so that we can look \n"
" // at neighboring body's hash value without loading\n"
" // two hash values per thread\n"
" sharedHash[get_local_id(0) + 1] = sortedData.x;\n"
" if((index > 0) && (get_local_id(0) == 0))\n"
" {\n"
" // first thread in block must load neighbor body hash\n"
" sharedHash[0] = pHash[index-1].x;\n"
" }\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" if(index < numObjects)\n"
" {\n"
" if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))\n"
" {\n"
" cellStart[sortedData.x] = index;\n"
" }\n"
" }\n"
"}\n"
"int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)\n"
"{\n"
" return (min0.x <= max1.x)&& (min1.x <= max0.x) && \n"
" (min0.y <= max1.y)&& (min1.y <= max0.y) && \n"
" (min0.z <= max1.z)&& (min1.z <= max0.z); \n"
"}\n"
"//search for AABB 'index' against other AABBs' in this cell\n"
"void findPairsInCell( int numObjects,\n"
" int4 gridPos,\n"
" int index,\n"
" __global int2* pHash,\n"
" __global int* pCellStart,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global float4* pParams,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int4 pGridDim = *((__global int4*)(pParams + 1));\n"
" int maxBodiesPerCell = pGridDim.w;\n"
" int gridHash = getPosHash(gridPos, pParams);\n"
" // get start of bucket for this cell\n"
" int bucketStart = pCellStart[gridHash];\n"
" if (bucketStart == -1)\n"
" {\n"
" return; // cell empty\n"
" }\n"
" // iterate over bodies in this cell\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 min0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0]; \n"
" float4 max0 = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" int handleIndex = as_int(min0.w);\n"
" \n"
" int bucketEnd = bucketStart + maxBodiesPerCell;\n"
" bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;\n"
" for(int index2 = bucketStart; index2 < bucketEnd; index2++) \n"
" {\n"
" int2 cellData = pHash[index2];\n"
" if (cellData.x != gridHash)\n"
" {\n"
" break; // no longer in same bucket\n"
" }\n"
" int unsorted_indx2 = cellData.y;\n"
" //if (unsorted_indx2 < unsorted_indx) // check not colliding with self\n"
" if (unsorted_indx2 != unsorted_indx) // check not colliding with self\n"
" { \n"
" float4 min1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 0];\n"
" float4 max1 = allpAABB[smallAabbMapping[unsorted_indx2]*2 + 1];\n"
" if(testAABBOverlap(min0, max0, min1, max1))\n"
" {\n"
" if (pairCount)\n"
" {\n"
" int handleIndex2 = as_int(min1.w);\n"
" if (handleIndex<handleIndex2)\n"
" {\n"
" int curPair = atomic_add(pairCount,1);\n"
" if (curPair<maxPairs)\n"
" {\n"
" int4 newpair;\n"
" newpair.x = handleIndex;\n"
" newpair.y = handleIndex2;\n"
" newpair.z = -1;\n"
" newpair.w = -1;\n"
" pPairBuff2[curPair] = newpair;\n"
" }\n"
" }\n"
" \n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void kFindOverlappingPairs( int numObjects,\n"
" __global float4* allpAABB, \n"
" __global const int* smallAabbMapping,\n"
" __global int2* pHash, \n"
" __global int* pCellStart, \n"
" __global float4* pParams ,\n"
" volatile __global int* pairCount,\n"
" __global int4* pPairBuff2,\n"
" int maxPairs\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if(index >= numObjects)\n"
" {\n"
" return;\n"
" }\n"
" int2 sortedData = pHash[index];\n"
" int unsorted_indx = sortedData.y;\n"
" float4 bbMin = allpAABB[smallAabbMapping[unsorted_indx]*2 + 0];\n"
" float4 bbMax = allpAABB[smallAabbMapping[unsorted_indx]*2 + 1];\n"
" float4 pos;\n"
" pos.x = (bbMin.x + bbMax.x) * 0.5f;\n"
" pos.y = (bbMin.y + bbMax.y) * 0.5f;\n"
" pos.z = (bbMin.z + bbMax.z) * 0.5f;\n"
" // get address in grid\n"
" int4 gridPosA = getGridPos(pos, pParams);\n"
" int4 gridPosB; \n"
" // examine only neighbouring cells\n"
" for(int z=-1; z<=1; z++) \n"
" {\n"
" gridPosB.z = gridPosA.z + z;\n"
" for(int y=-1; y<=1; y++) \n"
" {\n"
" gridPosB.y = gridPosA.y + y;\n"
" for(int x=-1; x<=1; x++) \n"
" {\n"
" gridPosB.x = gridPosA.x + x;\n"
" findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, allpAABB,smallAabbMapping, pParams, pairCount,pPairBuff2, maxPairs);\n"
" }\n"
" }\n"
" }\n"
"}\n";

View File

@@ -1,342 +1,341 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* sapCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#define NEW_PAIR_MARKER -1\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"/// conservative test for overlap between two aabbs\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
" overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
" overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"__kernel void computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping, __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numUnsortedAabbs)\n"
" return;\n"
" int j = get_global_id(1);\n"
" if (j>=numUnSortedAabbs2)\n"
" return;\n"
" __global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
" __global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
" if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
" {\n"
" int4 myPair;\n"
" \n"
" int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
" int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
" if (xIndex>yIndex)\n"
" {\n"
" int tmp = xIndex;\n"
" xIndex=yIndex;\n"
" yIndex=tmp;\n"
" }\n"
" \n"
" myPair.x = xIndex;\n"
" myPair.y = yIndex;\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" for (int j=i+1;j<numObjects;j++)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" break;\n"
" }\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__kernel void computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = aabbs[i].m_minIndices[3];\n"
" myPair.y = aabbs[j].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" j++;\n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
"}\n"
"__kernel void computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
"{\n"
" int i = get_global_id(0);\n"
" int localId = get_local_id(0);\n"
" __local int numActiveWgItems[1];\n"
" __local int breakRequest[1];\n"
" __local btAabbCL localAabbs[128];// = aabbs[i];\n"
" \n"
" btAabbCL myAabb;\n"
" \n"
" myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
" float testValue = myAabb.m_maxElems[axis];\n"
" \n"
" if (localId==0)\n"
" {\n"
" numActiveWgItems[0] = 0;\n"
" breakRequest[0] = 0;\n"
" }\n"
" int localCount=0;\n"
" int block=0;\n"
" localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" atomic_inc(numActiveWgItems);\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" int localBreak = 0;\n"
" \n"
" int j=i+1;\n"
" do\n"
" {\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j<numObjects)\n"
" {\n"
" if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
" {\n"
" if (!localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (j>=numObjects && !localBreak)\n"
" {\n"
" atomic_inc(breakRequest);\n"
" localBreak = 1;\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" \n"
" if (!localBreak)\n"
" {\n"
" if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
" {\n"
" int4 myPair;\n"
" myPair.x = myAabb.m_minIndices[3];\n"
" myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
" myPair.z = NEW_PAIR_MARKER;\n"
" myPair.w = NEW_PAIR_MARKER;\n"
" int curPair = atomic_inc (pairCount);\n"
" if (curPair<maxPairs)\n"
" {\n"
" pairsOut[curPair] = myPair; //flush to main memory\n"
" }\n"
" }\n"
" }\n"
" \n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" localCount++;\n"
" if (localCount==64)\n"
" {\n"
" localCount = 0;\n"
" block+=64; \n"
" localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
" localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
" }\n"
" j++;\n"
" \n"
" } while (breakRequest[0]<numActiveWgItems[0]);\n"
" \n"
"}\n"
"//http://stereopsis.com/radix.html\n"
"unsigned int FloatFlip(float fl);\n"
"unsigned int FloatFlip(float fl)\n"
"{\n"
" unsigned int f = *(unsigned int*)&fl;\n"
" unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
" return f ^ mask;\n"
"}\n"
"float IFloatFlip(unsigned int f);\n"
"float IFloatFlip(unsigned int f)\n"
"{\n"
" unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
" unsigned int fl = f ^ mask;\n"
" return *(float*)&fl;\n"
"}\n"
"__kernel void copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" int src = destAabbs[i].m_maxIndices[3];\n"
" destAabbs[i] = allAabbs[src];\n"
" destAabbs[i].m_maxIndices[3] = src;\n"
"}\n"
"__kernel void flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" \n"
" sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
" sortData[i].y = i;\n"
" \n"
"}\n"
"__kernel void scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numObjects)\n"
" return;\n"
" \n"
" sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
"}\n"
"__kernel void prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numAabbs)\n"
" return;\n"
" \n"
" btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
" \n"
" float4 s;\n"
" s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
" sum[i]=s;\n"
" sum2[i]=s*s; \n"
"}\n"
;
// Auto-generated embedded OpenCL source for the sweep-and-prune (SAP) broadphase
// kernels (pair generation, AABB copy/sort helpers, and variance reduction).
// NOTE: this string is produced by the kernel-stringify step from the original
// .cl file — do not hand-edit the literal text; regenerate it instead. Some
// tooling depends on the exact bytes (including trailing spaces inside lines).
static const char* sapCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#define NEW_PAIR_MARKER -1\n"
	"typedef struct \n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float4	m_min;\n"
	"		float   m_minElems[4];\n"
	"		int			m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float4	m_max;\n"
	"		float   m_maxElems[4];\n"
	"		int			m_maxIndices[4];\n"
	"	};\n"
	"} btAabbCL;\n"
	"/// conservative test for overlap between two aabbs\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
	"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
	"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const int* unsortedAabbMapping,  __global const int* unsortedAabbMapping2, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numUnSortedAabbs2, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numUnsortedAabbs)\n"
	"		return;\n"
	"	int j = get_global_id(1);\n"
	"	if (j>=numUnSortedAabbs2)\n"
	"		return;\n"
	"	__global const btAabbCL* unsortedAabbPtr = &unsortedAabbs[unsortedAabbMapping[i]];\n"
	"	__global const btAabbCL* unsortedAabbPtr2 = &unsortedAabbs[unsortedAabbMapping2[j]];\n"
	"	if (TestAabbAgainstAabb2GlobalGlobal(unsortedAabbPtr,unsortedAabbPtr2))\n"
	"	{\n"
	"		int4 myPair;\n"
	"		\n"
	"		int xIndex = unsortedAabbPtr[0].m_minIndices[3];\n"
	"		int yIndex = unsortedAabbPtr2[0].m_minIndices[3];\n"
	"		if (xIndex>yIndex)\n"
	"		{\n"
	"			int tmp = xIndex;\n"
	"			xIndex=yIndex;\n"
	"			yIndex=tmp;\n"
	"		}\n"
	"		\n"
	"		myPair.x = xIndex;\n"
	"		myPair.y = yIndex;\n"
	"		myPair.z = NEW_PAIR_MARKER;\n"
	"		myPair.w = NEW_PAIR_MARKER;\n"
	"		int curPair = atomic_inc (pairCount);\n"
	"		if (curPair<maxPairs)\n"
	"		{\n"
	"			pairsOut[curPair] = myPair; //flush to main memory\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBruteForce( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	for (int j=i+1;j<numObjects;j++)\n"
	"	{\n"
	"		if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"		{\n"
	"			break;\n"
	"		}\n"
	"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"		{\n"
	"			int4 myPair;\n"
	"			myPair.x = aabbs[i].m_minIndices[3];\n"
	"			myPair.y = aabbs[j].m_minIndices[3];\n"
	"			myPair.z = NEW_PAIR_MARKER;\n"
	"			myPair.w = NEW_PAIR_MARKER;\n"
	"			int curPair = atomic_inc (pairCount);\n"
	"			if (curPair<maxPairs)\n"
	"			{\n"
	"				pairsOut[curPair] = myPair; //flush to main memory\n"
	"			}\n"
	"		}\n"
	"	}\n"
	"}\n"
	"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"			if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = aabbs[i].m_minIndices[3];\n"
	"				myPair.y = aabbs[j].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		j++;\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"}\n"
	"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	int localId = get_local_id(0);\n"
	"	__local int numActiveWgItems[1];\n"
	"	__local int breakRequest[1];\n"
	"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
	"	\n"
	"	btAabbCL myAabb;\n"
	"	\n"
	"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
	"	float testValue = 	myAabb.m_maxElems[axis];\n"
	"	\n"
	"	if (localId==0)\n"
	"	{\n"
	"		numActiveWgItems[0] = 0;\n"
	"		breakRequest[0] = 0;\n"
	"	}\n"
	"	int localCount=0;\n"
	"	int block=0;\n"
	"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
	"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
	"	\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	atomic_inc(numActiveWgItems);\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	int localBreak = 0;\n"
	"	\n"
	"	int j=i+1;\n"
	"	do\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	\n"
	"		if (j<numObjects)\n"
	"		{\n"
	"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
	"			{\n"
	"				if (!localBreak)\n"
	"				{\n"
	"					atomic_inc(breakRequest);\n"
	"					localBreak = 1;\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (j>=numObjects && !localBreak)\n"
	"		{\n"
	"			atomic_inc(breakRequest);\n"
	"			localBreak = 1;\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		\n"
	"		if (!localBreak)\n"
	"		{\n"
	"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
	"			{\n"
	"				int4 myPair;\n"
	"				myPair.x = myAabb.m_minIndices[3];\n"
	"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
	"				myPair.z = NEW_PAIR_MARKER;\n"
	"				myPair.w = NEW_PAIR_MARKER;\n"
	"				int curPair = atomic_inc (pairCount);\n"
	"				if (curPair<maxPairs)\n"
	"				{\n"
	"					pairsOut[curPair] = myPair; //flush to main memory\n"
	"				}\n"
	"			}\n"
	"		}\n"
	"		\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"		localCount++;\n"
	"		if (localCount==64)\n"
	"		{\n"
	"			localCount = 0;\n"
	"			block+=64;			\n"
	"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
	"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
	"		}\n"
	"		j++;\n"
	"		\n"
	"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
	"	\n"
	"}\n"
	"//http://stereopsis.com/radix.html\n"
	"unsigned int FloatFlip(float fl);\n"
	"unsigned int FloatFlip(float fl)\n"
	"{\n"
	"	unsigned int f = *(unsigned int*)&fl;\n"
	"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
	"	return f ^ mask;\n"
	"}\n"
	"float IFloatFlip(unsigned int f);\n"
	"float IFloatFlip(unsigned int f)\n"
	"{\n"
	"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
	"	unsigned int fl = f ^ mask;\n"
	"	return *(float*)&fl;\n"
	"}\n"
	"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	int src = destAabbs[i].m_maxIndices[3];\n"
	"	destAabbs[i] = allAabbs[src];\n"
	"	destAabbs[i].m_maxIndices[3] = src;\n"
	"}\n"
	"__kernel void   flipFloatKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global int2* sortData, int numObjects, int axis)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	\n"
	"	sortData[i].x = FloatFlip(allAabbs[smallAabbMapping[i]].m_minElems[axis]);\n"
	"	sortData[i].y = i;\n"
	"		\n"
	"}\n"
	"__kernel void   scatterKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numObjects)\n"
	"		return;\n"
	"	\n"
	"	sortedAabbs[i] = allAabbs[smallAabbMapping[sortData[i].y]];\n"
	"}\n"
	"__kernel void   prepareSumVarianceKernel( __global const btAabbCL* allAabbs, __global const int* smallAabbMapping, __global float4* sum, __global float4* sum2,int numAabbs)\n"
	"{\n"
	"	int i = get_global_id(0);\n"
	"	if (i>=numAabbs)\n"
	"		return;\n"
	"	\n"
	"	btAabbCL smallAabb = allAabbs[smallAabbMapping[i]];\n"
	"	\n"
	"	float4 s;\n"
	"	s = (smallAabb.m_max+smallAabb.m_min)*0.5f;\n"
	"	sum[i]=s;\n"
	"	sum2[i]=s*s;	\n"
	"}\n";

View File

@@ -17,7 +17,7 @@ subject to the following restrictions:
#define B3_OPENCL_INCLUDE_H
#ifdef B3_USE_CLEW
#include "clew/clew.h"
#include "clew/clew.h"
#else
#ifdef __APPLE__
@@ -25,7 +25,7 @@ subject to the following restrictions:
#include <MiniCL/cl.h>
#else
#include <OpenCL/cl.h>
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#include <OpenCL/cl_ext.h> //clLogMessagesToStderrAPPLE
#endif
#else
#ifdef USE_MINICL
@@ -34,15 +34,18 @@ subject to the following restrictions:
#include <CL/cl.h>
#ifdef _WIN32
#include "CL/cl_gl.h"
#endif //_WIN32
#endif //_WIN32
#endif
#endif //__APPLE__
#endif //B3_USE_CLEW
#endif //__APPLE__
#endif //B3_USE_CLEW
#include <assert.h>
#include <stdio.h>
#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
#endif //B3_OPENCL_INCLUDE_H
#define oclCHECKERROR(a, b) \
if ((a) != (b)) \
{ \
printf("OCL Error : %d\n", (a)); \
assert((a) == (b)); \
}
#endif //B3_OPENCL_INCLUDE_H

File diff suppressed because it is too large Load Diff

View File

@@ -22,42 +22,41 @@ subject to the following restrictions:
#include "b3OpenCLInclude.h"
#ifdef __cplusplus
extern "C" {
extern "C"
{
#endif
///C API for OpenCL utilities: convenience functions, see below for C++ API
///C API for OpenCL utilities: convenience functions, see below for C++ API
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex, cl_platform_id* platformId);
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
cl_context b3OpenCLUtils_createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC , int preferredDeviceIndex , int preferredPlatformIndex, cl_platform_id* platformId);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
int b3OpenCLUtils_getNumDevices(cl_context cxMainContext);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
cl_device_id b3OpenCLUtils_getDevice(cl_context cxMainContext, int nr);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
void b3OpenCLUtils_printDeviceInfo(cl_device_id device);
cl_kernel b3OpenCLUtils_compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog,const char* additionalMacros);
cl_kernel b3OpenCLUtils_compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros);
//optional
cl_program b3OpenCLUtils_compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum,const char* additionalMacros , const char* srcFileNameForCaching, bool disableBinaryCaching);
//optional
cl_program b3OpenCLUtils_compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros, const char* srcFileNameForCaching, bool disableBinaryCaching);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
//the following optional APIs provide access using specific platform information
int b3OpenCLUtils_getNumPlatforms(cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
cl_platform_id b3OpenCLUtils_getPlatform(int nr, cl_int* pErrNum);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
void b3OpenCLUtils_printPlatformInfo(cl_platform_id platform);
const char* b3OpenCLUtils_getSdkVendorName();
const char* b3OpenCLUtils_getSdkVendorName();
///set the path (directory/folder) where the compiled OpenCL kernel are stored
void b3OpenCLUtils_setCachePath(const char* path);
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx , void* pGLDC ,int preferredDeviceIndex , int preferredPlatformIndex);
///set the path (directory/folder) where the compiled OpenCL kernel are stored
void b3OpenCLUtils_setCachePath(const char* path);
cl_context b3OpenCLUtils_createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex);
#ifdef __cplusplus
}
@@ -71,37 +70,35 @@ typedef struct
char m_driverVersion[B3_MAX_STRING_LENGTH];
char m_deviceExtensions[B3_MAX_STRING_LENGTH];
cl_device_type m_deviceType;
cl_uint m_computeUnits;
size_t m_workitemDims;
size_t m_workItemSize[3];
size_t m_image2dMaxWidth;
size_t m_image2dMaxHeight;
size_t m_image3dMaxWidth;
size_t m_image3dMaxHeight;
size_t m_image3dMaxDepth;
size_t m_workgroupSize;
cl_uint m_clockFrequency;
cl_ulong m_constantBufferSize;
cl_ulong m_localMemSize;
cl_ulong m_globalMemSize;
cl_bool m_errorCorrectionSupport;
cl_device_type m_deviceType;
cl_uint m_computeUnits;
size_t m_workitemDims;
size_t m_workItemSize[3];
size_t m_image2dMaxWidth;
size_t m_image2dMaxHeight;
size_t m_image3dMaxWidth;
size_t m_image3dMaxHeight;
size_t m_image3dMaxDepth;
size_t m_workgroupSize;
cl_uint m_clockFrequency;
cl_ulong m_constantBufferSize;
cl_ulong m_localMemSize;
cl_ulong m_globalMemSize;
cl_bool m_errorCorrectionSupport;
cl_device_local_mem_type m_localMemType;
cl_uint m_maxReadImageArgs;
cl_uint m_maxWriteImageArgs;
cl_uint m_maxReadImageArgs;
cl_uint m_maxWriteImageArgs;
cl_uint m_addressBits;
cl_ulong m_maxMemAllocSize;
cl_uint m_addressBits;
cl_ulong m_maxMemAllocSize;
cl_command_queue_properties m_queueProperties;
cl_bool m_imageSupport;
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
cl_bool m_imageSupport;
cl_uint m_vecWidthChar;
cl_uint m_vecWidthShort;
cl_uint m_vecWidthInt;
cl_uint m_vecWidthLong;
cl_uint m_vecWidthFloat;
cl_uint m_vecWidthDouble;
} b3OpenCLDeviceInfo;
@@ -110,33 +107,32 @@ struct b3OpenCLPlatformInfo
char m_platformVendor[B3_MAX_STRING_LENGTH];
char m_platformName[B3_MAX_STRING_LENGTH];
char m_platformVersion[B3_MAX_STRING_LENGTH];
b3OpenCLPlatformInfo()
{
m_platformVendor[0]=0;
m_platformName[0]=0;
m_platformVersion[0]=0;
m_platformVendor[0] = 0;
m_platformName[0] = 0;
m_platformVersion[0] = 0;
}
};
///C++ API for OpenCL utilities: convenience functions
struct b3OpenCLUtils
{
/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1, cl_platform_id* platformId=0)
static inline cl_context createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1, cl_platform_id* platformId = 0)
{
return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx , pGLDC , preferredDeviceIndex, preferredPlatformIndex, platformId);
return b3OpenCLUtils_createContextFromType(deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex, platformId);
}
static inline int getNumDevices(cl_context cxMainContext)
{
return b3OpenCLUtils_getNumDevices(cxMainContext);
}
static inline cl_device_id getDevice(cl_context cxMainContext, int nr)
{
return b3OpenCLUtils_getDevice(cxMainContext,nr);
return b3OpenCLUtils_getDevice(cxMainContext, nr);
}
static void getDeviceInfo(cl_device_id device, b3OpenCLDeviceInfo* info);
@@ -146,28 +142,28 @@ struct b3OpenCLUtils
b3OpenCLUtils_printDeviceInfo(device);
}
static inline cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" )
static inline cl_kernel compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum = 0, cl_program prog = 0, const char* additionalMacros = "")
{
return b3OpenCLUtils_compileCLKernelFromString(clContext,device, kernelSource, kernelName, pErrNum, prog,additionalMacros);
return b3OpenCLUtils_compileCLKernelFromString(clContext, device, kernelSource, kernelName, pErrNum, prog, additionalMacros);
}
//optional
static inline cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0, bool disableBinaryCaching=false)
static inline cl_program compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum = 0, const char* additionalMacros = "", const char* srcFileNameForCaching = 0, bool disableBinaryCaching = false)
{
return b3OpenCLUtils_compileCLProgramFromString(clContext,device, kernelSource, pErrNum,additionalMacros, srcFileNameForCaching, disableBinaryCaching);
return b3OpenCLUtils_compileCLProgramFromString(clContext, device, kernelSource, pErrNum, additionalMacros, srcFileNameForCaching, disableBinaryCaching);
}
//the following optional APIs provide access using specific platform information
static inline int getNumPlatforms(cl_int* pErrNum=0)
static inline int getNumPlatforms(cl_int* pErrNum = 0)
{
return b3OpenCLUtils_getNumPlatforms(pErrNum);
}
///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum=0)
static inline cl_platform_id getPlatform(int nr, cl_int* pErrNum = 0)
{
return b3OpenCLUtils_getPlatform(nr,pErrNum);
return b3OpenCLUtils_getPlatform(nr, pErrNum);
}
static void getPlatformInfo(cl_platform_id platform, b3OpenCLPlatformInfo* platformInfo);
static inline void printPlatformInfo(cl_platform_id platform)
@@ -179,9 +175,9 @@ struct b3OpenCLUtils
{
return b3OpenCLUtils_getSdkVendorName();
}
static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1)
static inline cl_context createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex = -1)
{
return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx,pGLDC,preferredDeviceIndex, preferredPlatformIndex);
return b3OpenCLUtils_createContextFromPlatform(platform, deviceType, pErrNum, pGLCtx, pGLDC, preferredDeviceIndex, preferredPlatformIndex);
}
static void setCachePath(const char* path)
{
@@ -189,6 +185,6 @@ struct b3OpenCLUtils
}
};
#endif //__cplusplus
#endif //__cplusplus
#endif // B3_OPENCL_UTILS_H
#endif // B3_OPENCL_UTILS_H

View File

@@ -5,14 +5,13 @@
struct b3BvhInfo
{
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_quantization;
int m_numNodes;
int m_numSubTrees;
int m_nodeOffset;
int m_subTreeOffset;
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_quantization;
int m_numNodes;
int m_numSubTrees;
int m_nodeOffset;
int m_subTreeOffset;
};
#endif //B3_BVH_INFO_H
#endif //B3_BVH_INFO_H

View File

@@ -15,7 +15,6 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3ContactCache.h"
#include "Bullet3Common/b3Transform.h"
@@ -69,7 +68,7 @@ int b3ContactCache::sortCachedPoints(const b3Vector3& pt)
maxPenetration = m_pointCache[i].getDistance();
}
}
#endif //KEEP_DEEPEST_POINT
#endif //KEEP_DEEPEST_POINT
b3Scalar res0(b3Scalar(0.)),res1(b3Scalar(0.)),res2(b3Scalar(0.)),res3(b3Scalar(0.));
@@ -251,8 +250,4 @@ void b3ContactCache::refreshContactPoints(const b3Transform& trA,const b3Transfo
}
#endif

View File

@@ -17,17 +17,13 @@ subject to the following restrictions:
#ifndef B3_CONTACT_CACHE_H
#define B3_CONTACT_CACHE_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Common/b3AlignedAllocator.h"
///maximum contact breaking and merging threshold
extern b3Scalar gContactBreakingThreshold;
#define MANIFOLD_CACHE_SIZE 4
///b3ContactCache is a contact point cache, it stays persistent as long as objects are overlapping in the broadphase.
@@ -37,24 +33,16 @@ extern b3Scalar gContactBreakingThreshold;
///reduces the cache to 4 points, when more then 4 points are added, using following rules:
///the contact point with deepest penetration is always kept, and it tries to maximuze the area covered by the points
///note that some pairs of objects might have more then one contact manifold.
B3_ATTRIBUTE_ALIGNED16( class) b3ContactCache
B3_ATTRIBUTE_ALIGNED16(class)
b3ContactCache
{
/// sort cached points so most isolated points come first
int sortCachedPoints(const b3Vector3& pt);
int sortCachedPoints(const b3Vector3& pt);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
int addManifoldPoint( const b3Vector3& newPoint);
int addManifoldPoint(const b3Vector3& newPoint);
/*void replaceContactPoint(const b3Vector3& newPoint,int insertIndex)
{
@@ -63,18 +51,12 @@ public:
}
*/
static bool validContactDistance(const b3Vector3& pt);
/// calculated new worldspace coordinates and depth, and reject points that exceed the collision margin
static void refreshContactPoints( const b3Transform& trA,const b3Transform& trB, struct b3Contact4Data& newContactCache);
static void removeContactPoint(struct b3Contact4Data& newContactCache,int i);
static void refreshContactPoints(const b3Transform& trA, const b3Transform& trB, struct b3Contact4Data& newContactCache);
static void removeContactPoint(struct b3Contact4Data & newContactCache, int i);
};
#endif //B3_CONTACT_CACHE_H
#endif //B3_CONTACT_CACHE_H

File diff suppressed because it is too large Load Diff

View File

@@ -17,102 +17,90 @@
//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
struct GpuSatCollision
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_mprPenetrationKernel;
cl_kernel m_findSeparatingAxisUnitSphereKernel;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_findSeparatingAxisKernel;
cl_kernel m_mprPenetrationKernel;
cl_kernel m_findSeparatingAxisUnitSphereKernel;
cl_kernel m_findSeparatingAxisVertexFaceKernel;
cl_kernel m_findSeparatingAxisEdgeEdgeKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel;
cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel;
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndFindContacts;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
// cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
cl_kernel m_findConcaveSeparatingAxisKernel;
cl_kernel m_findConcaveSeparatingAxisVertexFaceKernel;
cl_kernel m_findConcaveSeparatingAxisEdgeEdgeKernel;
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_findCompoundPairsKernel;
cl_kernel m_processCompoundPairsKernel;
cl_kernel m_clipHullHullKernel;
cl_kernel m_clipCompoundsHullHullKernel;
cl_kernel m_clipFacesAndFindContacts;
cl_kernel m_findClippingFacesKernel;
cl_kernel m_clipHullHullConcaveConvexKernel;
// cl_kernel m_extractManifoldAndAddContactKernel;
cl_kernel m_newContactReductionKernel;
cl_kernel m_bvhTraversalKernel;
cl_kernel m_primitiveContactsKernel;
cl_kernel m_findConcaveSphereContactsKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
cl_kernel m_processCompoundPairsPrimitivesKernel;
b3OpenCLArray<b3Vector3> m_unitSphereDirections;
b3OpenCLArray<int> m_totalContactsOut;
b3OpenCLArray<int> m_totalContactsOut;
b3OpenCLArray<b3Vector3> m_sepNormals;
b3OpenCLArray<float> m_dmins;
b3OpenCLArray<int> m_hasSeparatingNormals;
b3OpenCLArray<int> m_hasSeparatingNormals;
b3OpenCLArray<b3Vector3> m_concaveSepNormals;
b3OpenCLArray<int> m_concaveHasSeparatingNormals;
b3OpenCLArray<int> m_numConcavePairsOut;
b3OpenCLArray<int> m_concaveHasSeparatingNormals;
b3OpenCLArray<int> m_numConcavePairsOut;
b3OpenCLArray<b3CompoundOverlappingPair> m_gpuCompoundPairs;
b3OpenCLArray<b3Vector3> m_gpuCompoundSepNormals;
b3OpenCLArray<int> m_gpuHasCompoundSepNormals;
b3OpenCLArray<int> m_numCompoundPairsOut;
b3OpenCLArray<int> m_gpuHasCompoundSepNormals;
b3OpenCLArray<int> m_numCompoundPairsOut;
GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue q );
GpuSatCollision(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~GpuSatCollision();
void computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
const b3OpenCLArray<b3Contact4>* oldContacts,
int maxContactCapacity,
int compoundPairCapacity,
const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut
);
void computeConvexConvexContactsGPUSAT(b3OpenCLArray<b3Int4>* pairs, int nPairs,
const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
b3OpenCLArray<b3Contact4>* contactOut, int& nContacts,
const b3OpenCLArray<b3Contact4>* oldContacts,
int maxContactCapacity,
int compoundPairCapacity,
const b3OpenCLArray<b3ConvexPolyhedronData>& hostConvexData,
const b3OpenCLArray<b3Vector3>& vertices,
const b3OpenCLArray<b3Vector3>& uniqueEdges,
const b3OpenCLArray<b3GpuFace>& faces,
const b3OpenCLArray<int>& indices,
const b3OpenCLArray<b3Collidable>& gpuCollidables,
const b3OpenCLArray<b3GpuChildShape>& gpuChildShapes,
const b3OpenCLArray<b3Aabb>& clAabbsWorldSpace,
const b3OpenCLArray<b3Aabb>& clAabbsLocalSpace,
b3OpenCLArray<b3Vector3>& worldVertsB1GPU,
b3OpenCLArray<b3Int4>& clippingFacesOutGPU,
b3OpenCLArray<b3Vector3>& worldNormalsAGPU,
b3OpenCLArray<b3Vector3>& worldVertsA1GPU,
b3OpenCLArray<b3Vector3>& worldVertsB2GPU,
b3AlignedObjectArray<class b3OptimizedBvh*>& bvhData,
b3OpenCLArray<b3QuantizedBvhNode>* treeNodesGPU,
b3OpenCLArray<b3BvhSubtreeInfo>* subTreesGPU,
b3OpenCLArray<b3BvhInfo>* bvhInfo,
int numObjects,
int maxTriConvexPairCapacity,
b3OpenCLArray<b3Int4>& triangleConvexPairs,
int& numTriConvexPairsOut);
};
#endif //_CONVEX_HULL_CONTACT_H
#endif //_CONVEX_HULL_CONTACT_H

View File

@@ -4,6 +4,4 @@
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
#endif //CONVEX_POLYHEDRON_CL
#endif //CONVEX_POLYHEDRON_CL

File diff suppressed because it is too large Load Diff

View File

@@ -29,40 +29,39 @@ GJK-EPA collision solver by Nathanael Presson, 2008
#include "Bullet3Common/b3Transform.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3ConvexPolyhedronData.h"
///btGjkEpaSolver contributed under zlib by Nathanael Presson
struct b3GjkEpaSolver2
struct b3GjkEpaSolver2
{
struct sResults
struct sResults
{
enum eStatus
enum eStatus
{
Separated, /* Shapes doesnt penetrate */
Penetrating, /* Shapes are penetrating */
GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */
EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */
} status;
b3Vector3 witnesses[2];
b3Vector3 normal;
b3Scalar distance;
Separated, /* Shapes doesnt penetrate */
Penetrating, /* Shapes are penetrating */
GJK_Failed, /* GJK phase fail, no big issue, shapes are probably just 'touching' */
EPA_Failed /* EPA phase fail, bigger problem, need to save parameters, and debug */
} status;
b3Vector3 witnesses[2];
b3Vector3 normal;
b3Scalar distance;
};
static int StackSizeRequirement();
static int StackSizeRequirement();
static bool Distance( const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results);
static bool Distance(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results);
static bool Penetration( const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
static bool Penetration(const b3Transform& transA, const b3Transform& transB,
const b3ConvexPolyhedronData* hullA, const b3ConvexPolyhedronData* hullB,
const b3AlignedObjectArray<b3Vector3>& verticesA,
const b3AlignedObjectArray<b3Vector3>& verticesB,
const b3Vector3& guess,
sResults& results,
bool usemargins=true);
bool usemargins = true);
#if 0
static b3Scalar SignedDistance( const b3Vector3& position,
b3Scalar margin,
@@ -74,9 +73,7 @@ static bool SignedDistance( const btConvexShape* shape0,const btTransform& wtrs
const btConvexShape* shape1,const btTransform& wtrs1,
const b3Vector3& guess,
sResults& results);
#endif
#endif
};
#endif //B3_GJK_EPA2_H
#endif //B3_GJK_EPA2_H

View File

@@ -13,50 +13,45 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#include "b3OptimizedBvh.h"
#include "b3StridingMeshInterface.h"
#include "Bullet3Geometry/b3AabbUtil.h"
b3OptimizedBvh::b3OptimizedBvh()
{
{
}
b3OptimizedBvh::~b3OptimizedBvh()
{
}
void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax)
{
m_useQuantization = useQuantizedAabbCompression;
// NodeArray triangleNodes;
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
struct NodeTriangleCallback : public b3InternalTriangleIndexCallback
{
NodeArray& m_triangleNodes;
NodeArray& m_triangleNodes;
NodeTriangleCallback& operator=(NodeTriangleCallback& other)
{
m_triangleNodes.copyFromArray(other.m_triangleNodes);
return *this;
}
NodeTriangleCallback(NodeArray& triangleNodes)
:m_triangleNodes(triangleNodes)
NodeTriangleCallback(NodeArray& triangleNodes)
: m_triangleNodes(triangleNodes)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
b3OptimizedBvhNode node;
b3Vector3 aabbMin,aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMin, aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
@@ -69,17 +64,17 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
node.m_aabbMaxOrg = aabbMax;
node.m_escapeIndex = -1;
//for child nodes
node.m_subPart = partId;
node.m_triangleIndex = triangleIndex;
m_triangleNodes.push_back(node);
}
};
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
struct QuantizedNodeTriangleCallback : public b3InternalTriangleIndexCallback
{
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeArray& m_triangleNodes;
const b3QuantizedBvh* m_optimizedTree; // for quantization
QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)
{
@@ -88,23 +83,23 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
return *this;
}
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes,const b3QuantizedBvh* tree)
:m_triangleNodes(triangleNodes),m_optimizedTree(tree)
QuantizedNodeTriangleCallback(QuantizedNodeArray& triangleNodes, const b3QuantizedBvh* tree)
: m_triangleNodes(triangleNodes), m_optimizedTree(tree)
{
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
// The partId and triangle index must fit in the same (positive) integer
b3Assert(partId < (1<<MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS)));
b3Assert(partId < (1 << MAX_NUM_PARTS_IN_BITS));
b3Assert(triangleIndex < (1 << (31 - MAX_NUM_PARTS_IN_BITS)));
//negative indices are reserved for escapeIndex
b3Assert(triangleIndex>=0);
b3Assert(triangleIndex >= 0);
b3QuantizedBvhNode node;
b3Vector3 aabbMin,aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMin, aabbMax;
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangle[0]);
aabbMax.setMax(triangle[0]);
aabbMin.setMin(triangle[1]);
@@ -131,59 +126,52 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
aabbMin.setZ(aabbMin.getZ() - MIN_AABB_HALF_DIMENSION);
}
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
m_optimizedTree->quantize(&node.m_quantizedAabbMin[0], aabbMin, 0);
m_optimizedTree->quantize(&node.m_quantizedAabbMax[0], aabbMax, 1);
node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
node.m_escapeIndexOrTriangleIndex = (partId << (31 - MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
m_triangleNodes.push_back(node);
}
};
int numLeafNodes = 0;
if (m_useQuantization)
{
//initialize quantization values
setQuantizationValues(bvhAabbMin,bvhAabbMax);
setQuantizationValues(bvhAabbMin, bvhAabbMax);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes,this);
QuantizedNodeTriangleCallback callback(m_quantizedLeafNodes, this);
triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax);
triangles->InternalProcessAllTriangles(&callback, m_bvhAabbMin, m_bvhAabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_quantizedLeafNodes.size();
m_quantizedContiguousNodes.resize(2*numLeafNodes);
} else
m_quantizedContiguousNodes.resize(2 * numLeafNodes);
}
else
{
NodeTriangleCallback callback(m_leafNodes);
NodeTriangleCallback callback(m_leafNodes);
b3Vector3 aabbMin=b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax=b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
b3Vector3 aabbMin = b3MakeVector3(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
b3Vector3 aabbMax = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax);
triangles->InternalProcessAllTriangles(&callback, aabbMin, aabbMax);
//now we have an array of leafnodes in m_leafNodes
numLeafNodes = m_leafNodes.size();
m_contiguousNodes.resize(2*numLeafNodes);
m_contiguousNodes.resize(2 * numLeafNodes);
}
m_curNodeIndex = 0;
buildTree(0,numLeafNodes);
buildTree(0, numLeafNodes);
///if the entire tree is small then subtree size, we need to create a header info for the tree
if(m_useQuantization && !m_SubtreeHeaders.size())
if (m_useQuantization && !m_SubtreeHeaders.size())
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
@@ -199,37 +187,29 @@ void b3OptimizedBvh::build(b3StridingMeshInterface* triangles, bool useQuantized
m_leafNodes.clear();
}
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
void b3OptimizedBvh::refit(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
if (m_useQuantization)
{
setQuantizationValues(aabbMin, aabbMax);
setQuantizationValues(aabbMin,aabbMax);
updateBvhNodes(meshInterface,0,m_curNodeIndex,0);
updateBvhNodes(meshInterface, 0, m_curNodeIndex, 0);
///now update all subtree headers
int i;
for (i=0;i<m_SubtreeHeaders.size();i++)
for (i = 0; i < m_SubtreeHeaders.size(); i++)
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
}
} else
}
else
{
}
}
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b3Vector3& aabbMin,const b3Vector3& aabbMax)
void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface, const b3Vector3& aabbMin, const b3Vector3& aabbMax)
{
//incrementally initialize quantization values
b3Assert(m_useQuantization);
@@ -244,147 +224,135 @@ void b3OptimizedBvh::refitPartial(b3StridingMeshInterface* meshInterface,const b
///we should update all quantization values, using updateBvhNodes(meshInterface);
///but we only update chunks that overlap the given aabb
unsigned short quantizedQueryAabbMin[3];
unsigned short quantizedQueryAabbMax[3];
quantize(&quantizedQueryAabbMin[0],aabbMin,0);
quantize(&quantizedQueryAabbMax[0],aabbMax,1);
unsigned short quantizedQueryAabbMin[3];
unsigned short quantizedQueryAabbMax[3];
quantize(&quantizedQueryAabbMin[0], aabbMin, 0);
quantize(&quantizedQueryAabbMax[0], aabbMax, 1);
int i;
for (i=0;i<this->m_SubtreeHeaders.size();i++)
for (i = 0; i < this->m_SubtreeHeaders.size(); i++)
{
b3BvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
//PCK: unsigned instead of bool
unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
unsigned overlap = b3TestQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin, quantizedQueryAabbMax, subtree.m_quantizedAabbMin, subtree.m_quantizedAabbMax);
if (overlap != 0)
{
updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i);
updateBvhNodes(meshInterface, subtree.m_rootNodeIndex, subtree.m_rootNodeIndex + subtree.m_subtreeSize, i);
subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
}
}
}
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index)
void b3OptimizedBvh::updateBvhNodes(b3StridingMeshInterface* meshInterface, int firstNode, int endNode, int index)
{
(void)index;
b3Assert(m_useQuantization);
int curNodeSubPart=-1;
int curNodeSubPart = -1;
//get access info to trianglemesh data
const unsigned char *vertexbase = 0;
int numverts = 0;
PHY_ScalarType type = PHY_INTEGER;
int stride = 0;
const unsigned char *indexbase = 0;
int indexstride = 0;
int numfaces = 0;
PHY_ScalarType indicestype = PHY_INTEGER;
const unsigned char* vertexbase = 0;
int numverts = 0;
PHY_ScalarType type = PHY_INTEGER;
int stride = 0;
const unsigned char* indexbase = 0;
int indexstride = 0;
int numfaces = 0;
PHY_ScalarType indicestype = PHY_INTEGER;
b3Vector3 triangleVerts[3];
b3Vector3 aabbMin,aabbMax;
const b3Vector3& meshScaling = meshInterface->getScaling();
int i;
for (i=endNode-1;i>=firstNode;i--)
b3Vector3 triangleVerts[3];
b3Vector3 aabbMin, aabbMax;
const b3Vector3& meshScaling = meshInterface->getScaling();
int i;
for (i = endNode - 1; i >= firstNode; i--)
{
b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
if (curNode.isLeafNode())
{
b3QuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
if (curNode.isLeafNode())
//recalc aabb from triangle data
int nodeSubPart = curNode.getPartId();
int nodeTriangleIndex = curNode.getTriangleIndex();
if (nodeSubPart != curNodeSubPart)
{
//recalc aabb from triangle data
int nodeSubPart = curNode.getPartId();
int nodeTriangleIndex = curNode.getTriangleIndex();
if (nodeSubPart != curNodeSubPart)
{
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts, type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart);
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numfaces, indicestype, nodeSubPart);
curNodeSubPart = nodeSubPart;
b3Assert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT);
}
//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
curNodeSubPart = nodeSubPart;
b3Assert(indicestype == PHY_INTEGER || indicestype == PHY_SHORT);
}
//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride);
for (int j=2;j>=0;j--)
{
int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j];
if (type == PHY_FLOAT)
{
float* graphicsbase = (float*)(vertexbase+graphicsindex*stride);
triangleVerts[j] = b3MakeVector3(
graphicsbase[0]*meshScaling.getX(),
graphicsbase[1]*meshScaling.getY(),
graphicsbase[2]*meshScaling.getZ());
}
else
{
double* graphicsbase = (double*)(vertexbase+graphicsindex*stride);
triangleVerts[j] = b3MakeVector3( b3Scalar(graphicsbase[0]*meshScaling.getX()), b3Scalar(graphicsbase[1]*meshScaling.getY()), b3Scalar(graphicsbase[2]*meshScaling.getZ()));
}
}
unsigned int* gfxbase = (unsigned int*)(indexbase + nodeTriangleIndex * indexstride);
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangleVerts[0]);
aabbMax.setMax(triangleVerts[0]);
aabbMin.setMin(triangleVerts[1]);
aabbMax.setMax(triangleVerts[1]);
aabbMin.setMin(triangleVerts[2]);
aabbMax.setMax(triangleVerts[2]);
quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0);
quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1);
} else
for (int j = 2; j >= 0; j--)
{
//combine aabb from both children
b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1];
b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] :
&m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()];
int graphicsindex = indicestype == PHY_SHORT ? ((unsigned short*)gfxbase)[j] : gfxbase[j];
if (type == PHY_FLOAT)
{
for (int i=0;i<3;i++)
{
curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
if (curNode.m_quantizedAabbMin[i]>rightChildNode->m_quantizedAabbMin[i])
curNode.m_quantizedAabbMin[i]=rightChildNode->m_quantizedAabbMin[i];
curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
}
float* graphicsbase = (float*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(
graphicsbase[0] * meshScaling.getX(),
graphicsbase[1] * meshScaling.getY(),
graphicsbase[2] * meshScaling.getZ());
}
else
{
double* graphicsbase = (double*)(vertexbase + graphicsindex * stride);
triangleVerts[j] = b3MakeVector3(b3Scalar(graphicsbase[0] * meshScaling.getX()), b3Scalar(graphicsbase[1] * meshScaling.getY()), b3Scalar(graphicsbase[2] * meshScaling.getZ()));
}
}
aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMin.setMin(triangleVerts[0]);
aabbMax.setMax(triangleVerts[0]);
aabbMin.setMin(triangleVerts[1]);
aabbMax.setMax(triangleVerts[1]);
aabbMin.setMin(triangleVerts[2]);
aabbMax.setMax(triangleVerts[2]);
quantize(&curNode.m_quantizedAabbMin[0], aabbMin, 0);
quantize(&curNode.m_quantizedAabbMax[0], aabbMax, 1);
}
else
{
//combine aabb from both children
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
b3QuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i + 1];
b3QuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i + 2] : &m_quantizedContiguousNodes[i + 1 + leftChildNode->getEscapeIndex()];
{
for (int i = 0; i < 3; i++)
{
curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
if (curNode.m_quantizedAabbMin[i] > rightChildNode->m_quantizedAabbMin[i])
curNode.m_quantizedAabbMin[i] = rightChildNode->m_quantizedAabbMin[i];
curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
}
}
}
}
if (curNodeSubPart >= 0)
meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
b3OptimizedBvh* b3OptimizedBvh::deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
{
b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
b3QuantizedBvh* bvh = b3QuantizedBvh::deSerializeInPlace(i_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
//we don't add additional data so just do a static upcast
return static_cast<b3OptimizedBvh*>(bvh);
}

View File

@@ -22,44 +22,35 @@ subject to the following restrictions:
class b3StridingMeshInterface;
///The b3OptimizedBvh extends the b3QuantizedBvh to create AABB tree for triangle meshes, through the b3StridingMeshInterface.
B3_ATTRIBUTE_ALIGNED16(class) b3OptimizedBvh : public b3QuantizedBvh
B3_ATTRIBUTE_ALIGNED16(class)
b3OptimizedBvh : public b3QuantizedBvh
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
protected:
public:
b3OptimizedBvh();
virtual ~b3OptimizedBvh();
void build(b3StridingMeshInterface* triangles,bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
void build(b3StridingMeshInterface * triangles, bool useQuantizedAabbCompression, const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax);
void refit(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin,const b3Vector3& aabbMax);
void refit(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
void refitPartial(b3StridingMeshInterface* triangles,const b3Vector3& aabbMin, const b3Vector3& aabbMax);
void refitPartial(b3StridingMeshInterface * triangles, const b3Vector3& aabbMin, const b3Vector3& aabbMax);
void updateBvhNodes(b3StridingMeshInterface* meshInterface,int firstNode,int endNode,int index);
void updateBvhNodes(b3StridingMeshInterface * meshInterface, int firstNode, int endNode, int index);
/// Data buffer MUST be 16 byte aligned
virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
virtual bool serializeInPlace(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
{
return b3QuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
return b3QuantizedBvh::serialize(o_alignedDataBuffer, i_dataBufferSize, i_swapEndian);
}
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3OptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static b3OptimizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
};
#endif //B3_OPTIMIZED_BVH_H
#endif //B3_OPTIMIZED_BVH_H

File diff suppressed because it is too large Load Diff

View File

@@ -22,11 +22,11 @@ class b3Serializer;
#ifdef DEBUG_CHECK_DEQUANTIZATION
#ifdef __SPU__
#define printf spu_printf
#endif //__SPU__
#endif //__SPU__
#include <stdio.h>
#include <stdlib.h>
#endif //DEBUG_CHECK_DEQUANTIZATION
#endif //DEBUG_CHECK_DEQUANTIZATION
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3AlignedAllocator.h"
@@ -44,13 +44,10 @@ class b3Serializer;
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3QuantizedBvhNodeData.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3BvhSubtreeInfoData.h"
//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
//Note: currently we have 16 bytes per quantized node
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
#define MAX_SUBTREE_SIZE_IN_BYTES 2048
// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one
// actually) triangles each (since the sign bit is reserved
@@ -58,7 +55,8 @@ class b3Serializer;
///b3QuantizedBvhNode is a compressed aabb node, 16 bytes.
///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeData
B3_ATTRIBUTE_ALIGNED16(struct)
b3QuantizedBvhNode : public b3QuantizedBvhNodeData
{
B3_DECLARE_ALIGNED_ALLOCATOR();
@@ -72,48 +70,48 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3QuantizedBvhNode : public b3QuantizedBvhNodeDa
b3Assert(!isLeafNode());
return -m_escapeIndexOrTriangleIndex;
}
int getTriangleIndex() const
int getTriangleIndex() const
{
b3Assert(isLeafNode());
unsigned int x=0;
unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
unsigned int x = 0;
unsigned int y = (~(x & 0)) << (31 - MAX_NUM_PARTS_IN_BITS);
// Get only the lower bits where the triangle index is stored
return (m_escapeIndexOrTriangleIndex&~(y));
return (m_escapeIndexOrTriangleIndex & ~(y));
}
int getPartId() const
int getPartId() const
{
b3Assert(isLeafNode());
// Get only the highest bits where the part index is stored
return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
return (m_escapeIndexOrTriangleIndex >> (31 - MAX_NUM_PARTS_IN_BITS));
}
}
;
};
/// b3OptimizedBvhNode contains both internal and leaf node information.
/// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes.
B3_ATTRIBUTE_ALIGNED16 (struct) b3OptimizedBvhNode
B3_ATTRIBUTE_ALIGNED16(struct)
b3OptimizedBvhNode
{
B3_DECLARE_ALIGNED_ALLOCATOR();
//32 bytes
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
b3Vector3 m_aabbMinOrg;
b3Vector3 m_aabbMaxOrg;
//4
int m_escapeIndex;
int m_escapeIndex;
//8
//for child nodes
int m_subPart;
int m_triangleIndex;
int m_subPart;
int m_triangleIndex;
//pad the size to 64 bytes
char m_padding[20];
//pad the size to 64 bytes
char m_padding[20];
};
///b3BvhSubtreeInfo provides info to gather a subtree of limited size
B3_ATTRIBUTE_ALIGNED16(class) b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
B3_ATTRIBUTE_ALIGNED16(class)
b3BvhSubtreeInfo : public b3BvhSubtreeInfoData
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
@@ -123,8 +121,7 @@ public:
//memset(&m_padding[0], 0, sizeof(m_padding));
}
void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
void setAabbFromQuantizeNode(const b3QuantizedBvhNode& quantizedNode)
{
m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0];
m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1];
@@ -133,14 +130,12 @@ public:
m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1];
m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2];
}
}
;
};
class b3NodeOverlapCallback
{
public:
virtual ~b3NodeOverlapCallback() {};
virtual ~b3NodeOverlapCallback(){};
virtual void processNode(int subPart, int triangleIndex) = 0;
};
@@ -148,18 +143,16 @@ public:
#include "Bullet3Common/b3AlignedAllocator.h"
#include "Bullet3Common/b3AlignedObjectArray.h"
///for code readability:
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
typedef b3AlignedObjectArray<b3OptimizedBvhNode> NodeArray;
typedef b3AlignedObjectArray<b3QuantizedBvhNode> QuantizedNodeArray;
typedef b3AlignedObjectArray<b3BvhSubtreeInfo> BvhSubtreeInfoArray;
///The b3QuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
///It is used by the b3BvhTriangleMeshShape as midphase
///It is recommended to use quantization for better performance and lower memory requirements.
B3_ATTRIBUTE_ALIGNED16(class) b3QuantizedBvh
B3_ATTRIBUTE_ALIGNED16(class)
b3QuantizedBvh
{
public:
enum b3TraversalMode
@@ -169,56 +162,48 @@ public:
TRAVERSAL_RECURSIVE
};
b3Vector3 m_bvhAabbMin;
b3Vector3 m_bvhAabbMax;
b3Vector3 m_bvhQuantization;
b3Vector3 m_bvhAabbMin;
b3Vector3 m_bvhAabbMax;
b3Vector3 m_bvhQuantization;
protected:
int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess.
int m_bulletVersion; //for serialization versioning. It could also be used to detect endianess.
int m_curNodeIndex;
int m_curNodeIndex;
//quantization data
bool m_useQuantization;
bool m_useQuantization;
NodeArray m_leafNodes;
NodeArray m_contiguousNodes;
QuantizedNodeArray m_quantizedLeafNodes;
QuantizedNodeArray m_quantizedContiguousNodes;
NodeArray m_leafNodes;
NodeArray m_contiguousNodes;
QuantizedNodeArray m_quantizedLeafNodes;
QuantizedNodeArray m_quantizedContiguousNodes;
b3TraversalMode m_traversalMode;
BvhSubtreeInfoArray m_SubtreeHeaders;
b3TraversalMode m_traversalMode;
BvhSubtreeInfoArray m_SubtreeHeaders;
//This is only used for serialization so we don't have to add serialization directly to b3AlignedObjectArray
mutable int m_subtreeHeaderCount;
///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
///this might be refactored into a virtual, it is usually not calculated at run-time
void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
void setInternalNodeAabbMin(int nodeIndex, const b3Vector3& aabbMin)
{
if (m_useQuantization)
{
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0);
} else
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0], aabbMin, 0);
}
else
{
m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
}
}
void setInternalNodeAabbMax(int nodeIndex,const b3Vector3& aabbMax)
void setInternalNodeAabbMax(int nodeIndex, const b3Vector3& aabbMax)
{
if (m_useQuantization)
{
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1);
} else
quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0], aabbMax, 1);
}
else
{
m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
}
@@ -232,115 +217,102 @@ protected:
}
//non-quantized
return m_leafNodes[nodeIndex].m_aabbMinOrg;
}
b3Vector3 getAabbMax(int nodeIndex) const
{
if (m_useQuantization)
{
return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
}
}
//non-quantized
return m_leafNodes[nodeIndex].m_aabbMaxOrg;
}
void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
void setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
{
if (m_useQuantization)
{
m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;
}
}
else
{
m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
}
}
void mergeInternalNodeAabb(int nodeIndex,const b3Vector3& newAabbMin,const b3Vector3& newAabbMax)
void mergeInternalNodeAabb(int nodeIndex, const b3Vector3& newAabbMin, const b3Vector3& newAabbMax)
{
if (m_useQuantization)
{
unsigned short int quantizedAabbMin[3];
unsigned short int quantizedAabbMax[3];
quantize(quantizedAabbMin,newAabbMin,0);
quantize(quantizedAabbMax,newAabbMax,1);
for (int i=0;i<3;i++)
quantize(quantizedAabbMin, newAabbMin, 0);
quantize(quantizedAabbMax, newAabbMax, 1);
for (int i = 0; i < 3; i++)
{
if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];
if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
}
} else
}
else
{
//non-quantized
m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);
}
}
void swapLeafNodes(int firstIndex,int secondIndex);
void swapLeafNodes(int firstIndex, int secondIndex);
void assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex);
void assignInternalNodeFromLeafNode(int internalNode, int leafNodeIndex);
protected:
void buildTree(int startIndex, int endIndex);
int calcSplittingAxis(int startIndex, int endIndex);
void buildTree (int startIndex,int endIndex);
int sortAndCalcSplittingIndex(int startIndex, int endIndex, int splitAxis);
int calcSplittingAxis(int startIndex,int endIndex);
void walkStacklessTree(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
int sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis);
void walkStacklessTree(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
void walkStacklessQuantizedTree(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const;
void walkStacklessTreeAgainstRay(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
void walkStacklessQuantizedTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;
void walkStacklessQuantizedTree(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax, int startNodeIndex, int endNodeIndex) const;
void walkStacklessTreeAgainstRay(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax, int startNodeIndex, int endNodeIndex) const;
///tree traversal designed for small-memory processors like PS3 SPU
void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
void walkStacklessQuantizedTreeCacheFriendly(b3NodeOverlapCallback * nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;
///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode,b3NodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
void walkRecursiveQuantizedTreeAgainstQueryAabb(const b3QuantizedBvhNode* currentNode, b3NodeOverlapCallback* nodeCallback, unsigned short int* quantizedQueryAabbMin, unsigned short int* quantizedQueryAabbMax) const;
///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA,const b3QuantizedBvhNode* treeNodeB,b3NodeOverlapCallback* nodeCallback) const;
void walkRecursiveQuantizedTreeAgainstQuantizedTree(const b3QuantizedBvhNode* treeNodeA, const b3QuantizedBvhNode* treeNodeB, b3NodeOverlapCallback* nodeCallback) const;
void updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
void updateSubtreeHeaders(int leftChildNodexIndex, int rightChildNodexIndex);
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3QuantizedBvh();
virtual ~b3QuantizedBvh();
///***************************************** expert/internal use only *************************
void setQuantizationValues(const b3Vector3& bvhAabbMin,const b3Vector3& bvhAabbMax,b3Scalar quantizationMargin=b3Scalar(1.0));
QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
void setQuantizationValues(const b3Vector3& bvhAabbMin, const b3Vector3& bvhAabbMax, b3Scalar quantizationMargin = b3Scalar(1.0));
QuantizedNodeArray& getLeafNodeArray() { return m_quantizedLeafNodes; }
///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
void buildInternal();
void buildInternal();
///***************************************** expert/internal use only *************************
void reportAabbOverlappingNodex(b3NodeOverlapCallback* nodeCallback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void reportRayOverlappingNodex (b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
void reportBoxCastOverlappingNodex(b3NodeOverlapCallback* nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
void reportAabbOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
void reportRayOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget) const;
void reportBoxCastOverlappingNodex(b3NodeOverlapCallback * nodeCallback, const b3Vector3& raySource, const b3Vector3& rayTarget, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point,int isMax) const
B3_FORCE_INLINE void quantize(unsigned short* out, const b3Vector3& point, int isMax) const
{
b3Assert(m_useQuantization);
b3Assert(point.getX() <= m_bvhAabbMax.getX());
@@ -357,16 +329,16 @@ public:
///@todo: double-check this
if (isMax)
{
out[0] = (unsigned short) (((unsigned short)(v.getX()+b3Scalar(1.)) | 1));
out[1] = (unsigned short) (((unsigned short)(v.getY()+b3Scalar(1.)) | 1));
out[2] = (unsigned short) (((unsigned short)(v.getZ()+b3Scalar(1.)) | 1));
} else
{
out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe));
out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe));
out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe));
out[0] = (unsigned short)(((unsigned short)(v.getX() + b3Scalar(1.)) | 1));
out[1] = (unsigned short)(((unsigned short)(v.getY() + b3Scalar(1.)) | 1));
out[2] = (unsigned short)(((unsigned short)(v.getZ() + b3Scalar(1.)) | 1));
}
else
{
out[0] = (unsigned short)(((unsigned short)(v.getX()) & 0xfffe));
out[1] = (unsigned short)(((unsigned short)(v.getY()) & 0xfffe));
out[2] = (unsigned short)(((unsigned short)(v.getZ()) & 0xfffe));
}
#ifdef DEBUG_CHECK_DEQUANTIZATION
b3Vector3 newPoint = unQuantize(out);
@@ -374,105 +346,97 @@ public:
{
if (newPoint.getX() < point.getX())
{
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
}
if (newPoint.getY() < point.getY())
{
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
}
if (newPoint.getZ() < point.getZ())
{
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
}
} else
}
else
{
if (newPoint.getX() > point.getX())
{
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n", newPoint.getX() - point.getX(), newPoint.getX(), point.getX());
}
if (newPoint.getY() > point.getY())
{
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n", newPoint.getY() - point.getY(), newPoint.getY(), point.getY());
}
if (newPoint.getZ() > point.getZ())
{
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n", newPoint.getZ() - point.getZ(), newPoint.getZ(), point.getZ());
}
}
#endif //DEBUG_CHECK_DEQUANTIZATION
#endif //DEBUG_CHECK_DEQUANTIZATION
}
B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2,int isMax) const
B3_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const b3Vector3& point2, int isMax) const
{
b3Assert(m_useQuantization);
b3Vector3 clampedPoint(point2);
clampedPoint.setMax(m_bvhAabbMin);
clampedPoint.setMin(m_bvhAabbMax);
quantize(out,clampedPoint,isMax);
quantize(out, clampedPoint, isMax);
}
B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
B3_FORCE_INLINE b3Vector3 unQuantize(const unsigned short* vecIn) const
{
b3Vector3 vecOut;
vecOut.setValue(
b3Vector3 vecOut;
vecOut.setValue(
(b3Scalar)(vecIn[0]) / (m_bvhQuantization.getX()),
(b3Scalar)(vecIn[1]) / (m_bvhQuantization.getY()),
(b3Scalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
vecOut += m_bvhAabbMin;
return vecOut;
vecOut += m_bvhAabbMin;
return vecOut;
}
///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
void setTraversalMode(b3TraversalMode traversalMode)
void setTraversalMode(b3TraversalMode traversalMode)
{
m_traversalMode = traversalMode;
}
B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
{
return m_quantizedContiguousNodes;
B3_FORCE_INLINE QuantizedNodeArray& getQuantizedNodeArray()
{
return m_quantizedContiguousNodes;
}
B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
B3_FORCE_INLINE BvhSubtreeInfoArray& getSubtreeInfoArray()
{
return m_SubtreeHeaders;
}
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
/////Calculate space needed to store BVH for serialization
unsigned calculateSerializeBufferSize() const;
/// Data buffer MUST be 16 byte aligned
virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
virtual bool serialize(void* o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
static b3QuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static b3QuantizedBvh* deSerializeInPlace(void* i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
static unsigned int getAlignmentSerializationPadding();
//////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////
virtual int calculateSerializeBufferSizeNew() const;
virtual int calculateSerializeBufferSizeNew() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
virtual void deSerializeFloat(struct b3QuantizedBvhFloatData& quantizedBvhFloatData);
virtual void deSerializeFloat(struct b3QuantizedBvhFloatData & quantizedBvhFloatData);
virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData& quantizedBvhDoubleData);
virtual void deSerializeDouble(struct b3QuantizedBvhDoubleData & quantizedBvhDoubleData);
////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////
B3_FORCE_INLINE bool isQuantized()
{
@@ -483,74 +447,65 @@ private:
// Special "copy" constructor that allows for in-place deserialization
// Prevents b3Vector3's default constructor from being called, but doesn't inialize much else
// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
b3QuantizedBvh(b3QuantizedBvh &other, bool ownsMemory);
}
;
b3QuantizedBvh(b3QuantizedBvh & other, bool ownsMemory);
};
struct b3OptimizedBvhNodeFloatData
{
b3Vector3FloatData m_aabbMinOrg;
b3Vector3FloatData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
b3Vector3FloatData m_aabbMinOrg;
b3Vector3FloatData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
};
struct b3OptimizedBvhNodeDoubleData
{
b3Vector3DoubleData m_aabbMinOrg;
b3Vector3DoubleData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
b3Vector3DoubleData m_aabbMinOrg;
b3Vector3DoubleData m_aabbMaxOrg;
int m_escapeIndex;
int m_subPart;
int m_triangleIndex;
char m_pad[4];
};
struct b3QuantizedBvhFloatData
struct b3QuantizedBvhFloatData
{
b3Vector3FloatData m_bvhAabbMin;
b3Vector3FloatData m_bvhAabbMax;
b3Vector3FloatData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeFloatData *m_contiguousNodesPtr;
b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
b3BvhSubtreeInfoData *m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3Vector3FloatData m_bvhAabbMin;
b3Vector3FloatData m_bvhAabbMax;
b3Vector3FloatData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeFloatData* m_contiguousNodesPtr;
b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
b3BvhSubtreeInfoData* m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
};
struct b3QuantizedBvhDoubleData
struct b3QuantizedBvhDoubleData
{
b3Vector3DoubleData m_bvhAabbMin;
b3Vector3DoubleData m_bvhAabbMax;
b3Vector3DoubleData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeDoubleData *m_contiguousNodesPtr;
b3QuantizedBvhNodeData *m_quantizedContiguousNodesPtr;
b3Vector3DoubleData m_bvhAabbMin;
b3Vector3DoubleData m_bvhAabbMax;
b3Vector3DoubleData m_bvhQuantization;
int m_curNodeIndex;
int m_useQuantization;
int m_numContiguousLeafNodes;
int m_numQuantizedContiguousNodes;
b3OptimizedBvhNodeDoubleData* m_contiguousNodesPtr;
b3QuantizedBvhNodeData* m_quantizedContiguousNodesPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3BvhSubtreeInfoData *m_subTreeInfoPtr;
int m_traversalMode;
int m_numSubtreeHeaders;
b3BvhSubtreeInfoData* m_subTreeInfoPtr;
};
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
B3_FORCE_INLINE int b3QuantizedBvh::calculateSerializeBufferSizeNew() const
{
return sizeof(b3QuantizedBvhData);
}
#endif //B3_QUANTIZED_BVH_H
#endif //B3_QUANTIZED_BVH_H

View File

@@ -15,35 +15,32 @@ subject to the following restrictions:
#include "b3StridingMeshInterface.h"
b3StridingMeshInterface::~b3StridingMeshInterface()
{
}
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const
void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
int numtotalphysicsverts = 0;
int part,graphicssubparts = getNumSubParts();
const unsigned char * vertexbase;
const unsigned char * indexbase;
int part, graphicssubparts = getNumSubParts();
const unsigned char* vertexbase;
const unsigned char* indexbase;
int indexstride;
PHY_ScalarType type;
PHY_ScalarType gfxindextype;
int stride,numverts,numtriangles;
int stride, numverts, numtriangles;
int gfxindex;
b3Vector3 triangle[3];
b3Vector3 meshScaling = getScaling();
///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
for (part=0;part<graphicssubparts ;part++)
for (part = 0; part < graphicssubparts; part++)
{
getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part);
numtotalphysicsverts+=numtriangles*3; //upper bound
getLockedReadOnlyVertexIndexBase(&vertexbase, numverts, type, stride, &indexbase, indexstride, numtriangles, gfxindextype, part);
numtotalphysicsverts += numtriangles * 3; //upper bound
///unlike that developers want to pass in double-precision meshes in single-precision Bullet build
///so disable this feature by default
@@ -51,143 +48,141 @@ void b3StridingMeshInterface::InternalProcessAllTriangles(b3InternalTriangleInde
switch (type)
{
case PHY_FLOAT:
{
case PHY_FLOAT:
{
float* graphicsbase;
float* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (float*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
graphicsbase = (float*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue(graphicsbase[0] * meshScaling.getX(), graphicsbase[1] * meshScaling.getY(), graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
switch (gfxindextype)
{
case PHY_INTEGER:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_SHORT:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
case PHY_UCHAR:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
{
unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(), graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
case PHY_DOUBLE:
case PHY_DOUBLE:
{
double* graphicsbase;
switch (gfxindextype)
{
case PHY_INTEGER:
case PHY_INTEGER:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
unsigned int* tri_indices = (unsigned int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_SHORT:
case PHY_SHORT:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
unsigned short int* tri_indices = (unsigned short int*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
case PHY_UCHAR:
case PHY_UCHAR:
{
for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
for (gfxindex = 0; gfxindex < numtriangles; gfxindex++)
{
unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
triangle[0].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(),(b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
triangle[1].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
triangle[2].setValue((b3Scalar)graphicsbase[0]*meshScaling.getX(),(b3Scalar)graphicsbase[1]*meshScaling.getY(), (b3Scalar)graphicsbase[2]*meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle,part,gfxindex);
unsigned char* tri_indices = (unsigned char*)(indexbase + gfxindex * indexstride);
graphicsbase = (double*)(vertexbase + tri_indices[0] * stride);
triangle[0].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[1] * stride);
triangle[1].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
graphicsbase = (double*)(vertexbase + tri_indices[2] * stride);
triangle[2].setValue((b3Scalar)graphicsbase[0] * meshScaling.getX(), (b3Scalar)graphicsbase[1] * meshScaling.getY(), (b3Scalar)graphicsbase[2] * meshScaling.getZ());
callback->internalProcessTriangleIndex(triangle, part, gfxindex);
}
break;
}
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
default:
b3Assert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
}
break;
}
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
default:
b3Assert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
}
unLockReadOnlyVertexBase(part);
}
}
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax)
void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin, b3Vector3& aabbMax)
{
struct AabbCalculationCallback : public b3InternalTriangleIndexCallback
struct AabbCalculationCallback : public b3InternalTriangleIndexCallback
{
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
b3Vector3 m_aabbMin;
b3Vector3 m_aabbMax;
AabbCalculationCallback()
{
m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
m_aabbMin.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
m_aabbMax.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
}
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex)
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex)
{
(void)partId;
(void)triangleIndex;
@@ -202,13 +197,11 @@ void b3StridingMeshInterface::calculateAabbBruteForce(b3Vector3& aabbMin,b3Vecto
};
//first calculate the total aabb for all triangles
AabbCalculationCallback aabbCallback;
aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT),b3Scalar(-B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
InternalProcessAllTriangles(&aabbCallback,aabbMin,aabbMax);
AabbCalculationCallback aabbCallback;
aabbMin.setValue(b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT), b3Scalar(-B3_LARGE_FLOAT));
aabbMax.setValue(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
InternalProcessAllTriangles(&aabbCallback, aabbMin, aabbMax);
aabbMin = aabbCallback.m_aabbMin;
aabbMax = aabbCallback.m_aabbMax;
}

View File

@@ -20,148 +20,139 @@ subject to the following restrictions:
#include "b3TriangleCallback.h"
//#include "b3ConcaveShape.h"
enum PHY_ScalarType {
PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT,
PHY_FIXEDPOINT88, PHY_UCHAR
enum PHY_ScalarType
{
PHY_FLOAT,
PHY_DOUBLE,
PHY_INTEGER,
PHY_SHORT,
PHY_FIXEDPOINT88,
PHY_UCHAR
};
/// The b3StridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with b3BvhTriangleMeshShape and some other collision shapes.
/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
B3_ATTRIBUTE_ALIGNED16(class ) b3StridingMeshInterface
B3_ATTRIBUTE_ALIGNED16(class)
b3StridingMeshInterface
{
protected:
b3Vector3 m_scaling;
protected:
b3Vector3 m_scaling;
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3StridingMeshInterface() :m_scaling(b3MakeVector3(b3Scalar(1.),b3Scalar(1.),b3Scalar(1.)))
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
}
b3StridingMeshInterface() : m_scaling(b3MakeVector3(b3Scalar(1.), b3Scalar(1.), b3Scalar(1.)))
{
}
virtual ~b3StridingMeshInterface();
virtual ~b3StridingMeshInterface();
virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback * callback, const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
///brute force method to calculate aabb
void calculateAabbBruteForce(b3Vector3 & aabbMin, b3Vector3 & aabbMax);
virtual void InternalProcessAllTriangles(b3InternalTriangleIndexCallback* callback,const b3Vector3& aabbMin,const b3Vector3& aabbMax) const;
/// get read and write access to a subpart of a triangle mesh
/// this subpart has a continuous array of vertices and indices
/// in this way the mesh can be handled as chunks of memory with striding
/// very similar to OpenGL vertexarray support
/// make a call to unLockVertexBase when the read and write access is finished
virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) = 0;
///brute force method to calculate aabb
void calculateAabbBruteForce(b3Vector3& aabbMin,b3Vector3& aabbMax);
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& stride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const = 0;
/// get read and write access to a subpart of a triangle mesh
/// this subpart has a continuous array of vertices and indices
/// in this way the mesh can be handled as chunks of memory with striding
/// very similar to OpenGL vertexarray support
/// make a call to unLockVertexBase when the read and write access is finished
virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0;
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart)=0;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart) = 0;
virtual void unLockReadOnlyVertexBase(int subpart) const=0;
virtual void unLockReadOnlyVertexBase(int subpart) const = 0;
/// getNumSubParts returns the number of seperate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const = 0;
/// getNumSubParts returns the number of seperate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const=0;
virtual void preallocateVertices(int numverts) = 0;
virtual void preallocateIndices(int numindices) = 0;
virtual void preallocateVertices(int numverts)=0;
virtual void preallocateIndices(int numindices)=0;
virtual bool hasPremadeAabb() const { return false; }
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
}
virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const
{
(void)aabbMin;
(void)aabbMax;
}
virtual bool hasPremadeAabb() const { return false; }
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
{
(void) aabbMin;
(void) aabbMax;
}
virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
{
(void) aabbMin;
(void) aabbMax;
}
const b3Vector3& getScaling() const {
return m_scaling;
}
void setScaling(const b3Vector3& scaling)
{
m_scaling = scaling;
}
virtual int calculateSerializeBufferSize() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
const b3Vector3& getScaling() const
{
return m_scaling;
}
void setScaling(const b3Vector3& scaling)
{
m_scaling = scaling;
}
virtual int calculateSerializeBufferSize() const;
///fills the dataBuffer and returns the struct name (and 0 on failure)
//virtual const char* serialize(void* dataBuffer, b3Serializer* serializer) const;
};
struct b3IntIndexData
struct b3IntIndexData
{
int m_value;
int m_value;
};
struct b3ShortIntIndexData
struct b3ShortIntIndexData
{
short m_value;
char m_pad[2];
};
struct b3ShortIntIndexTripletData
struct b3ShortIntIndexTripletData
{
short m_values[3];
char m_pad[2];
short m_values[3];
char m_pad[2];
};
struct b3CharIndexTripletData
struct b3CharIndexTripletData
{
unsigned char m_values[3];
char m_pad;
char m_pad;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
struct b3MeshPartData
struct b3MeshPartData
{
b3Vector3FloatData *m_vertices3f;
b3Vector3DoubleData *m_vertices3d;
b3Vector3FloatData* m_vertices3f;
b3Vector3DoubleData* m_vertices3d;
b3IntIndexData *m_indices32;
b3ShortIntIndexTripletData *m_3indices16;
b3CharIndexTripletData *m_3indices8;
b3IntIndexData* m_indices32;
b3ShortIntIndexTripletData* m_3indices16;
b3CharIndexTripletData* m_3indices8;
b3ShortIntIndexData *m_indices16;//backwards compatibility
b3ShortIntIndexData* m_indices16; //backwards compatibility
int m_numTriangles;//length of m_indices = m_numTriangles
int m_numVertices;
int m_numTriangles; //length of m_indices = m_numTriangles
int m_numVertices;
};
///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
struct b3StridingMeshInterfaceData
struct b3StridingMeshInterfaceData
{
b3MeshPartData *m_meshPartsPtr;
b3Vector3FloatData m_scaling;
int m_numMeshParts;
b3MeshPartData* m_meshPartsPtr;
b3Vector3FloatData m_scaling;
int m_numMeshParts;
char m_padding[4];
};
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
B3_FORCE_INLINE int b3StridingMeshInterface::calculateSerializeBufferSize() const
{
return sizeof(b3StridingMeshInterfaceData);
}
#endif //B3_STRIDING_MESHINTERFACE_H
#endif //B3_STRIDING_MESHINTERFACE_H

View File

@@ -6,33 +6,29 @@
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "b3VectorFloat4.h"
struct b3GjkPairDetector;
inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
inline b3Vector3 localGetSupportVertexWithMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA, b3Scalar margin)
{
b3Vector3 supVec = b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
b3Vector3 supVec = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
b3Scalar maxDot = b3Scalar(-B3_LARGE_FLOAT);
// Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically.
if( 0 < hull->m_numVertices )
{
const b3Vector3 scaled = supportVec;
int index = (int) scaled.maxDot( &verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot);
return verticesA[hull->m_vertexOffset+index];
}
return supVec;
// Here we take advantage of dot(a, b*c) = dot(a*b, c). Note: This is true mathematically, but not numerically.
if (0 < hull->m_numVertices)
{
const b3Vector3 scaled = supportVec;
int index = (int)scaled.maxDot(&verticesA[hull->m_vertexOffset], hull->m_numVertices, maxDot);
return verticesA[hull->m_vertexOffset + index];
}
return supVec;
}
inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec,const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA)
inline b3Vector3 localGetSupportVertexWithoutMargin(const float4& supportVec, const struct b3ConvexPolyhedronData* hull,
const b3AlignedObjectArray<b3Vector3>& verticesA)
{
return localGetSupportVertexWithMargin(supportVec,hull,verticesA,0.f);
return localGetSupportVertexWithMargin(supportVec, hull, verticesA, 0.f);
}
#endif //B3_SUPPORT_MAPPINGS_H
#endif //B3_SUPPORT_MAPPINGS_H

View File

@@ -17,12 +17,8 @@ subject to the following restrictions:
b3TriangleCallback::~b3TriangleCallback()
{
}
b3InternalTriangleIndexCallback::~b3InternalTriangleIndexCallback()
{
}

View File

@@ -18,13 +18,11 @@ subject to the following restrictions:
#include "Bullet3Common/b3Vector3.h"
///The b3TriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
///This callback is called by processAllTriangles for all b3ConcaveShape derived class, such as b3BvhTriangleMeshShape, b3StaticPlaneShape and b3HeightfieldTerrainShape.
class b3TriangleCallback
{
public:
virtual ~b3TriangleCallback();
virtual void processTriangle(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
@@ -32,11 +30,8 @@ public:
class b3InternalTriangleIndexCallback
{
public:
virtual ~b3InternalTriangleIndexCallback();
virtual void internalProcessTriangleIndex(b3Vector3* triangle,int partId,int triangleIndex) = 0;
virtual void internalProcessTriangleIndex(b3Vector3* triangle, int partId, int triangleIndex) = 0;
};
#endif //B3_TRIANGLE_CALLBACK_H
#endif //B3_TRIANGLE_CALLBACK_H

View File

@@ -15,81 +15,76 @@ subject to the following restrictions:
#include "b3TriangleIndexVertexArray.h"
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride)
: m_hasAabb(0)
b3TriangleIndexVertexArray::b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride)
: m_hasAabb(0)
{
b3IndexedMesh mesh;
mesh.m_numTriangles = numTriangles;
mesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase;
mesh.m_triangleIndexBase = (const unsigned char*)triangleIndexBase;
mesh.m_triangleIndexStride = triangleIndexStride;
mesh.m_numVertices = numVertices;
mesh.m_vertexBase = (const unsigned char *)vertexBase;
mesh.m_vertexBase = (const unsigned char*)vertexBase;
mesh.m_vertexStride = vertexStride;
addIndexedMesh(mesh);
}
b3TriangleIndexVertexArray::~b3TriangleIndexVertexArray()
{
}
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart)
void b3TriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart)
{
b3Assert(subpart< getNumSubParts() );
b3Assert(subpart < getNumSubParts());
b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (unsigned char *) mesh.m_vertexBase;
(*vertexbase) = (unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (unsigned char *)mesh.m_triangleIndexBase;
(*indexbase) = (unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const
void b3TriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart) const
{
const b3IndexedMesh& mesh = m_indexedMeshes[subpart];
numverts = mesh.m_numVertices;
(*vertexbase) = (const unsigned char *)mesh.m_vertexBase;
(*vertexbase) = (const unsigned char*)mesh.m_vertexBase;
type = mesh.m_vertexType;
type = mesh.m_vertexType;
vertexStride = mesh.m_vertexStride;
numfaces = mesh.m_numTriangles;
(*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase;
(*indexbase) = (const unsigned char*)mesh.m_triangleIndexBase;
indexstride = mesh.m_triangleIndexStride;
indicestype = mesh.m_indexType;
}
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
bool b3TriangleIndexVertexArray::hasPremadeAabb() const
{
return (m_hasAabb == 1);
}
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const
void b3TriangleIndexVertexArray::setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const
{
m_aabbMin = aabbMin;
m_aabbMax = aabbMax;
m_hasAabb = 1; // this is intentionally an int see notes in header
m_hasAabb = 1; // this is intentionally an int see notes in header
}
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const
void b3TriangleIndexVertexArray::getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax) const
{
*aabbMin = m_aabbMin;
*aabbMax = m_aabbMax;
}

View File

@@ -20,62 +20,59 @@ subject to the following restrictions:
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Common/b3Scalar.h"
///The b3IndexedMesh indexes a single vertex and index array. Multiple b3IndexedMesh objects can be passed into a b3TriangleIndexVertexArray using addIndexedMesh.
///Instead of the number of indices, we pass the number of triangles.
B3_ATTRIBUTE_ALIGNED16( struct) b3IndexedMesh
B3_ATTRIBUTE_ALIGNED16(struct)
b3IndexedMesh
{
B3_DECLARE_ALIGNED_ALLOCATOR();
int m_numTriangles;
const unsigned char * m_triangleIndexBase;
// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
int m_triangleIndexStride;
int m_numVertices;
const unsigned char * m_vertexBase;
// Size of a vertex, in bytes
int m_vertexStride;
int m_numTriangles;
const unsigned char* m_triangleIndexBase;
// Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
int m_triangleIndexStride;
int m_numVertices;
const unsigned char* m_vertexBase;
// Size of a vertex, in bytes
int m_vertexStride;
// The index type is set when adding an indexed mesh to the
// b3TriangleIndexVertexArray, do not set it manually
PHY_ScalarType m_indexType;
// The index type is set when adding an indexed mesh to the
// b3TriangleIndexVertexArray, do not set it manually
PHY_ScalarType m_indexType;
// The vertex type has a default type similar to Bullet's precision mode (float or double)
// but can be set manually if you for example run Bullet with double precision but have
// mesh data in single precision..
PHY_ScalarType m_vertexType;
// The vertex type has a default type similar to Bullet's precision mode (float or double)
// but can be set manually if you for example run Bullet with double precision but have
// mesh data in single precision..
PHY_ScalarType m_vertexType;
b3IndexedMesh()
:m_indexType(PHY_INTEGER),
b3IndexedMesh()
: m_indexType(PHY_INTEGER),
#ifdef B3_USE_DOUBLE_PRECISION
m_vertexType(PHY_DOUBLE)
#else // B3_USE_DOUBLE_PRECISION
m_vertexType(PHY_FLOAT)
#endif // B3_USE_DOUBLE_PRECISION
{
}
}
;
m_vertexType(PHY_DOUBLE)
#else // B3_USE_DOUBLE_PRECISION
m_vertexType(PHY_FLOAT)
#endif // B3_USE_DOUBLE_PRECISION
{
}
};
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
typedef b3AlignedObjectArray<b3IndexedMesh> IndexedMeshArray;
///The b3TriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
///Additional meshes can be added using addIndexedMesh
///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays.
///So keep those arrays around during the lifetime of this b3TriangleIndexVertexArray.
B3_ATTRIBUTE_ALIGNED16( class) b3TriangleIndexVertexArray : public b3StridingMeshInterface
B3_ATTRIBUTE_ALIGNED16(class)
b3TriangleIndexVertexArray : public b3StridingMeshInterface
{
protected:
IndexedMeshArray m_indexedMeshes;
IndexedMeshArray m_indexedMeshes;
int m_pad[2];
mutable int m_hasAabb; // using int instead of bool to maintain alignment
mutable int m_hasAabb; // using int instead of bool to maintain alignment
mutable b3Vector3 m_aabbMin;
mutable b3Vector3 m_aabbMax;
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
b3TriangleIndexVertexArray() : m_hasAabb(0)
@@ -85,49 +82,47 @@ public:
virtual ~b3TriangleIndexVertexArray();
//just to be backwards compatible
b3TriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,b3Scalar* vertexBase,int vertexStride);
void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
b3TriangleIndexVertexArray(int numTriangles, int* triangleIndexBase, int triangleIndexStride, int numVertices, b3Scalar* vertexBase, int vertexStride);
void addIndexedMesh(const b3IndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
{
m_indexedMeshes.push_back(mesh);
m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType;
m_indexedMeshes[m_indexedMeshes.size() - 1].m_indexType = indexType;
}
virtual void getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0);
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const;
virtual void getLockedVertexIndexBase(unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0);
virtual void getLockedReadOnlyVertexIndexBase(const unsigned char** vertexbase, int& numverts, PHY_ScalarType& type, int& vertexStride, const unsigned char** indexbase, int& indexstride, int& numfaces, PHY_ScalarType& indicestype, int subpart = 0) const;
/// unLockVertexBase finishes the access to a subpart of the triangle mesh
/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
virtual void unLockVertexBase(int subpart) {(void)subpart;}
virtual void unLockVertexBase(int subpart) { (void)subpart; }
virtual void unLockReadOnlyVertexBase(int subpart) const {(void)subpart;}
virtual void unLockReadOnlyVertexBase(int subpart) const { (void)subpart; }
/// getNumSubParts returns the number of seperate subparts
/// each subpart has a continuous array of vertices and indices
virtual int getNumSubParts() const {
virtual int getNumSubParts() const
{
return (int)m_indexedMeshes.size();
}
IndexedMeshArray& getIndexedMeshArray()
IndexedMeshArray& getIndexedMeshArray()
{
return m_indexedMeshes;
}
const IndexedMeshArray& getIndexedMeshArray() const
const IndexedMeshArray& getIndexedMeshArray() const
{
return m_indexedMeshes;
}
virtual void preallocateVertices(int numverts){(void) numverts;}
virtual void preallocateIndices(int numindices){(void) numindices;}
virtual void preallocateVertices(int numverts) { (void)numverts; }
virtual void preallocateIndices(int numindices) { (void)numindices; }
virtual bool hasPremadeAabb() const;
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax ) const;
virtual void getPremadeAabb(b3Vector3* aabbMin, b3Vector3* aabbMax ) const;
virtual bool hasPremadeAabb() const;
virtual void setPremadeAabb(const b3Vector3& aabbMin, const b3Vector3& aabbMax) const;
virtual void getPremadeAabb(b3Vector3 * aabbMin, b3Vector3 * aabbMax) const;
};
}
;
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H
#endif //B3_TRIANGLE_INDEX_VERTEX_ARRAY_H

View File

@@ -7,5 +7,4 @@
#define float4 b3Vector3
//#define make_float4(x,y,z,w) b3Vector4(x,y,z,w)
#endif //B3_VECTOR_FLOAT4_H
#endif //B3_VECTOR_FLOAT4_H

View File

@@ -23,26 +23,24 @@ subject to the following restrictions:
*/
#include "b3VoronoiSimplexSolver.h"
#define VERTA 0
#define VERTB 1
#define VERTC 2
#define VERTD 3
#define VERTA 0
#define VERTB 1
#define VERTC 2
#define VERTD 3
#define B3_CATCH_DEGENERATE_TETRAHEDRON 1
void b3VoronoiSimplexSolver::removeVertex(int index)
void b3VoronoiSimplexSolver::removeVertex(int index)
{
b3Assert(m_numVertices>0);
b3Assert(m_numVertices > 0);
m_numVertices--;
m_simplexVectorW[index] = m_simplexVectorW[m_numVertices];
m_simplexPointsP[index] = m_simplexPointsP[m_numVertices];
m_simplexPointsQ[index] = m_simplexPointsQ[m_numVertices];
}
void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts)
void b3VoronoiSimplexSolver::reduceVertices(const b3UsageBitfield& usedVerts)
{
if ((numVertices() >= 4) && (!usedVerts.usedVertexD))
removeVertex(3);
@@ -52,29 +50,22 @@ void b3VoronoiSimplexSolver::reduceVertices (const b3UsageBitfield& usedVerts)
if ((numVertices() >= 2) && (!usedVerts.usedVertexB))
removeVertex(1);
if ((numVertices() >= 1) && (!usedVerts.usedVertexA))
removeVertex(0);
}
//clear the simplex, remove all the vertices
void b3VoronoiSimplexSolver::reset()
{
m_cachedValidClosest = false;
m_numVertices = 0;
m_needsUpdate = true;
m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT),b3Scalar(B3_LARGE_FLOAT));
m_lastW = b3MakeVector3(b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT), b3Scalar(B3_LARGE_FLOAT));
m_cachedBC.reset();
}
//add a vertex
//add a vertex
void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q)
{
m_lastW = w;
@@ -87,9 +78,8 @@ void b3VoronoiSimplexSolver::addVertex(const b3Vector3& w, const b3Vector3& p, c
m_numVertices++;
}
bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
{
if (m_needsUpdate)
{
m_cachedBC.reset();
@@ -98,127 +88,131 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
switch (numVertices())
{
case 0:
case 0:
m_cachedValidClosest = false;
break;
case 1:
case 1:
{
m_cachedP1 = m_simplexPointsP[0];
m_cachedP2 = m_simplexPointsQ[0];
m_cachedV = m_cachedP1-m_cachedP2; //== m_simplexVectorW[0]
m_cachedV = m_cachedP1 - m_cachedP2; //== m_simplexVectorW[0]
m_cachedBC.reset();
m_cachedBC.setBarycentricCoordinates(b3Scalar(1.),b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
m_cachedBC.setBarycentricCoordinates(b3Scalar(1.), b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
m_cachedValidClosest = m_cachedBC.isValid();
break;
};
case 2:
case 2:
{
//closest point origin from line segment
const b3Vector3& from = m_simplexVectorW[0];
const b3Vector3& to = m_simplexVectorW[1];
b3Vector3 nearest;
//closest point origin from line segment
const b3Vector3& from = m_simplexVectorW[0];
const b3Vector3& to = m_simplexVectorW[1];
b3Vector3 nearest;
b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
b3Vector3 diff = p - from;
b3Vector3 v = to - from;
b3Scalar t = v.dot(diff);
if (t > 0) {
b3Scalar dotVV = v.dot(v);
if (t < dotVV) {
t /= dotVV;
diff -= t*v;
m_cachedBC.m_usedVertices.usedVertexA = true;
m_cachedBC.m_usedVertices.usedVertexB = true;
} else {
t = 1;
diff -= v;
//reduce to 1 point
m_cachedBC.m_usedVertices.usedVertexB = true;
}
} else
b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
b3Vector3 diff = p - from;
b3Vector3 v = to - from;
b3Scalar t = v.dot(diff);
if (t > 0)
{
b3Scalar dotVV = v.dot(v);
if (t < dotVV)
{
t = 0;
//reduce to 1 point
t /= dotVV;
diff -= t * v;
m_cachedBC.m_usedVertices.usedVertexA = true;
m_cachedBC.m_usedVertices.usedVertexB = true;
}
m_cachedBC.setBarycentricCoordinates(1-t,t);
nearest = from + t*v;
else
{
t = 1;
diff -= v;
//reduce to 1 point
m_cachedBC.m_usedVertices.usedVertexB = true;
}
}
else
{
t = 0;
//reduce to 1 point
m_cachedBC.m_usedVertices.usedVertexA = true;
}
m_cachedBC.setBarycentricCoordinates(1 - t, t);
nearest = from + t * v;
m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
m_cachedV = m_cachedP1 - m_cachedP2;
reduceVertices(m_cachedBC.m_usedVertices);
m_cachedP1 = m_simplexPointsP[0] + t * (m_simplexPointsP[1] - m_simplexPointsP[0]);
m_cachedP2 = m_simplexPointsQ[0] + t * (m_simplexPointsQ[1] - m_simplexPointsQ[0]);
m_cachedV = m_cachedP1 - m_cachedP2;
m_cachedValidClosest = m_cachedBC.isValid();
break;
reduceVertices(m_cachedBC.m_usedVertices);
m_cachedValidClosest = m_cachedBC.isValid();
break;
}
case 3:
{
//closest point origin from triangle
b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
const b3Vector3& a = m_simplexVectorW[0];
const b3Vector3& b = m_simplexVectorW[1];
const b3Vector3& c = m_simplexVectorW[2];
closestPtPointTriangle(p,a,b,c,m_cachedBC);
m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedV = m_cachedP1-m_cachedP2;
reduceVertices (m_cachedBC.m_usedVertices);
m_cachedValidClosest = m_cachedBC.isValid();
break;
}
case 4:
case 3:
{
//closest point origin from triangle
b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
const b3Vector3& a = m_simplexVectorW[0];
const b3Vector3& b = m_simplexVectorW[1];
const b3Vector3& c = m_simplexVectorW[2];
closestPtPointTriangle(p, a, b, c, m_cachedBC);
m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2];
m_cachedV = m_cachedP1 - m_cachedP2;
reduceVertices(m_cachedBC.m_usedVertices);
m_cachedValidClosest = m_cachedBC.isValid();
break;
}
case 4:
{
b3Vector3 p = b3MakeVector3(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
b3Vector3 p =b3MakeVector3(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
const b3Vector3& a = m_simplexVectorW[0];
const b3Vector3& b = m_simplexVectorW[1];
const b3Vector3& c = m_simplexVectorW[2];
const b3Vector3& d = m_simplexVectorW[3];
bool hasSeperation = closestPtPointTetrahedron(p,a,b,c,d,m_cachedBC);
bool hasSeperation = closestPtPointTetrahedron(p, a, b, c, d, m_cachedBC);
if (hasSeperation)
{
m_cachedP1 = m_simplexPointsP[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
m_simplexPointsP[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsP[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsP[3] * m_cachedBC.m_barycentricCoords[3];
m_cachedP2 = m_simplexPointsQ[0] * m_cachedBC.m_barycentricCoords[0] +
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
m_simplexPointsQ[1] * m_cachedBC.m_barycentricCoords[1] +
m_simplexPointsQ[2] * m_cachedBC.m_barycentricCoords[2] +
m_simplexPointsQ[3] * m_cachedBC.m_barycentricCoords[3];
m_cachedV = m_cachedP1-m_cachedP2;
reduceVertices (m_cachedBC.m_usedVertices);
} else
m_cachedV = m_cachedP1 - m_cachedP2;
reduceVertices(m_cachedBC.m_usedVertices);
}
else
{
// printf("sub distance got penetration\n");
// printf("sub distance got penetration\n");
if (m_cachedBC.m_degenerate)
{
m_cachedValidClosest = false;
} else
}
else
{
m_cachedValidClosest = true;
//degenerate case == false, penetration = true + zero
m_cachedV.setValue(b3Scalar(0.),b3Scalar(0.),b3Scalar(0.));
m_cachedV.setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
}
break;
}
@@ -228,7 +222,7 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
//closest point origin from tetrahedron
break;
}
default:
default:
{
m_cachedValidClosest = false;
}
@@ -236,7 +230,6 @@ bool b3VoronoiSimplexSolver::updateClosestVectorAndPoints()
}
return m_cachedValidClosest;
}
//return/calculate the closest vertex
@@ -247,13 +240,11 @@ bool b3VoronoiSimplexSolver::closest(b3Vector3& v)
return succes;
}
b3Scalar b3VoronoiSimplexSolver::maxVertex()
{
int i, numverts = numVertices();
b3Scalar maxV = b3Scalar(0.);
for (i=0;i<numverts;i++)
for (i = 0; i < numverts; i++)
{
b3Scalar curLen2 = m_simplexVectorW[i].length2();
if (maxV < curLen2)
@@ -262,13 +253,11 @@ b3Scalar b3VoronoiSimplexSolver::maxVertex()
return maxV;
}
//return the current simplex
int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const
//return the current simplex
int b3VoronoiSimplexSolver::getSimplex(b3Vector3* pBuf, b3Vector3* qBuf, b3Vector3* yBuf) const
{
int i;
for (i=0;i<numVertices();i++)
for (i = 0; i < numVertices(); i++)
{
yBuf[i] = m_simplexVectorW[i];
pBuf[i] = m_simplexPointsP[i];
@@ -277,20 +266,17 @@ int b3VoronoiSimplexSolver::getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vecto
return numVertices();
}
bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
{
bool found = false;
int i, numverts = numVertices();
//b3Scalar maxV = b3Scalar(0.);
//w is in the current (reduced) simplex
for (i=0;i<numverts;i++)
for (i = 0; i < numverts; i++)
{
#ifdef BT_USE_EQUAL_VERTEX_THRESHOLD
if ( m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
if (m_simplexVectorW[i].distance2(w) <= m_equalVertexThreshold)
#else
if (m_simplexVectorW[i] == w)
#endif
@@ -300,199 +286,190 @@ bool b3VoronoiSimplexSolver::inSimplex(const b3Vector3& w)
//check in case lastW is already removed
if (w == m_lastW)
return true;
return found;
}
void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v)
void b3VoronoiSimplexSolver::backup_closest(b3Vector3& v)
{
v = m_cachedV;
}
bool b3VoronoiSimplexSolver::emptySimplex() const
bool b3VoronoiSimplexSolver::emptySimplex() const
{
return (numVertices() == 0);
}
void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2)
void b3VoronoiSimplexSolver::compute_points(b3Vector3& p1, b3Vector3& p2)
{
updateClosestVectorAndPoints();
p1 = m_cachedP1;
p2 = m_cachedP2;
}
bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result)
bool b3VoronoiSimplexSolver::closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result)
{
result.m_usedVertices.reset();
// Check if P in vertex region outside A
b3Vector3 ab = b - a;
b3Vector3 ac = c - a;
b3Vector3 ap = p - a;
b3Scalar d1 = ab.dot(ap);
b3Scalar d2 = ac.dot(ap);
if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0))
// Check if P in vertex region outside A
b3Vector3 ab = b - a;
b3Vector3 ac = c - a;
b3Vector3 ap = p - a;
b3Scalar d1 = ab.dot(ap);
b3Scalar d2 = ac.dot(ap);
if (d1 <= b3Scalar(0.0) && d2 <= b3Scalar(0.0))
{
result.m_closestPointOnSimplex = a;
result.m_usedVertices.usedVertexA = true;
result.setBarycentricCoordinates(1,0,0);
return true;// a; // barycentric coordinates (1,0,0)
result.setBarycentricCoordinates(1, 0, 0);
return true; // a; // barycentric coordinates (1,0,0)
}
// Check if P in vertex region outside B
b3Vector3 bp = p - b;
b3Scalar d3 = ab.dot(bp);
b3Scalar d4 = ac.dot(bp);
if (d3 >= b3Scalar(0.0) && d4 <= d3)
// Check if P in vertex region outside B
b3Vector3 bp = p - b;
b3Scalar d3 = ab.dot(bp);
b3Scalar d4 = ac.dot(bp);
if (d3 >= b3Scalar(0.0) && d4 <= d3)
{
result.m_closestPointOnSimplex = b;
result.m_usedVertices.usedVertexB = true;
result.setBarycentricCoordinates(0,1,0);
result.setBarycentricCoordinates(0, 1, 0);
return true; // b; // barycentric coordinates (0,1,0)
return true; // b; // barycentric coordinates (0,1,0)
}
// Check if P in edge region of AB, if so return projection of P onto AB
b3Scalar vc = d1*d4 - d3*d2;
if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0)) {
b3Scalar v = d1 / (d1 - d3);
// Check if P in edge region of AB, if so return projection of P onto AB
b3Scalar vc = d1 * d4 - d3 * d2;
if (vc <= b3Scalar(0.0) && d1 >= b3Scalar(0.0) && d3 <= b3Scalar(0.0))
{
b3Scalar v = d1 / (d1 - d3);
result.m_closestPointOnSimplex = a + v * ab;
result.m_usedVertices.usedVertexA = true;
result.m_usedVertices.usedVertexB = true;
result.setBarycentricCoordinates(1-v,v,0);
result.setBarycentricCoordinates(1 - v, v, 0);
return true;
//return a + v * ab; // barycentric coordinates (1-v,v,0)
}
//return a + v * ab; // barycentric coordinates (1-v,v,0)
}
// Check if P in vertex region outside C
b3Vector3 cp = p - c;
b3Scalar d5 = ab.dot(cp);
b3Scalar d6 = ac.dot(cp);
if (d6 >= b3Scalar(0.0) && d5 <= d6)
// Check if P in vertex region outside C
b3Vector3 cp = p - c;
b3Scalar d5 = ab.dot(cp);
b3Scalar d6 = ac.dot(cp);
if (d6 >= b3Scalar(0.0) && d5 <= d6)
{
result.m_closestPointOnSimplex = c;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(0,0,1);
return true;//c; // barycentric coordinates (0,0,1)
result.setBarycentricCoordinates(0, 0, 1);
return true; //c; // barycentric coordinates (0,0,1)
}
// Check if P in edge region of AC, if so return projection of P onto AC
b3Scalar vb = d5*d2 - d1*d6;
if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0)) {
b3Scalar w = d2 / (d2 - d6);
// Check if P in edge region of AC, if so return projection of P onto AC
b3Scalar vb = d5 * d2 - d1 * d6;
if (vb <= b3Scalar(0.0) && d2 >= b3Scalar(0.0) && d6 <= b3Scalar(0.0))
{
b3Scalar w = d2 / (d2 - d6);
result.m_closestPointOnSimplex = a + w * ac;
result.m_usedVertices.usedVertexA = true;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(1-w,0,w);
result.setBarycentricCoordinates(1 - w, 0, w);
return true;
//return a + w * ac; // barycentric coordinates (1-w,0,w)
}
//return a + w * ac; // barycentric coordinates (1-w,0,w)
}
// Check if P in edge region of BC, if so return projection of P onto BC
b3Scalar va = d3 * d6 - d5 * d4;
if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0))
{
b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
// Check if P in edge region of BC, if so return projection of P onto BC
b3Scalar va = d3*d6 - d5*d4;
if (va <= b3Scalar(0.0) && (d4 - d3) >= b3Scalar(0.0) && (d5 - d6) >= b3Scalar(0.0)) {
b3Scalar w = (d4 - d3) / ((d4 - d3) + (d5 - d6));
result.m_closestPointOnSimplex = b + w * (c - b);
result.m_usedVertices.usedVertexB = true;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(0,1-w,w);
return true;
// return b + w * (c - b); // barycentric coordinates (0,1-w,w)
}
result.setBarycentricCoordinates(0, 1 - w, w);
return true;
// return b + w * (c - b); // barycentric coordinates (0,1-w,w)
}
// P inside face region. Compute Q through its barycentric coordinates (u,v,w)
b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
b3Scalar v = vb * denom;
b3Scalar w = vc * denom;
// P inside face region. Compute Q through its barycentric coordinates (u,v,w)
b3Scalar denom = b3Scalar(1.0) / (va + vb + vc);
b3Scalar v = vb * denom;
b3Scalar w = vc * denom;
result.m_closestPointOnSimplex = a + ab * v + ac * w;
result.m_usedVertices.usedVertexA = true;
result.m_usedVertices.usedVertexB = true;
result.m_usedVertices.usedVertexC = true;
result.setBarycentricCoordinates(1-v-w,v,w);
result.setBarycentricCoordinates(1 - v - w, v, w);
return true;
// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
// return a + ab * v + ac * w; // = u*a + v*b + w*c, u = va * denom = b3Scalar(1.0) - v - w
}
/// Test if point p and d lie on opposite sides of plane through abc
int b3VoronoiSimplexSolver::pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d)
{
b3Vector3 normal = (b-a).cross(c-a);
b3Vector3 normal = (b - a).cross(c - a);
b3Scalar signp = (p - a).dot(normal); // [AP AB AC]
b3Scalar signd = (d - a).dot( normal); // [AD AB AC]
b3Scalar signp = (p - a).dot(normal); // [AP AB AC]
b3Scalar signd = (d - a).dot(normal); // [AD AB AC]
#ifdef B3_CATCH_DEGENERATE_TETRAHEDRON
#ifdef BT_USE_DOUBLE_PRECISION
if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
if (signd * signd < (b3Scalar(1e-8) * b3Scalar(1e-8)))
{
return -1;
}
#else
if (signd * signd < (b3Scalar(1e-4) * b3Scalar(1e-4)))
{
// printf("affine dependent/degenerate\n");//
// printf("affine dependent/degenerate\n");//
return -1;
}
#endif
#endif
// Points on opposite sides if expression signs are opposite
return signp * signd < b3Scalar(0.);
return signp * signd < b3Scalar(0.);
}
bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult)
{
b3SubSimplexClosestResult tempResult;
// Start out assuming point inside all halfspaces, so closest to itself
// Start out assuming point inside all halfspaces, so closest to itself
finalResult.m_closestPointOnSimplex = p;
finalResult.m_usedVertices.reset();
finalResult.m_usedVertices.usedVertexA = true;
finalResult.m_usedVertices.usedVertexA = true;
finalResult.m_usedVertices.usedVertexB = true;
finalResult.m_usedVertices.usedVertexC = true;
finalResult.m_usedVertices.usedVertexD = true;
int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
int pointOutsideABC = pointOutsideOfPlane(p, a, b, c, d);
int pointOutsideACD = pointOutsideOfPlane(p, a, c, d, b);
int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
int pointOutsideADB = pointOutsideOfPlane(p, a, d, b, c);
int pointOutsideBDC = pointOutsideOfPlane(p, b, d, c, a);
if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
{
finalResult.m_degenerate = true;
return false;
}
if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
{
return false;
}
b3Scalar bestSqDist = FLT_MAX;
// If point outside face abc then compute closest point on abc
if (pointOutsideABC)
if (pointOutsideABC < 0 || pointOutsideACD < 0 || pointOutsideADB < 0 || pointOutsideBDC < 0)
{
closestPtPointTriangle(p, a, b, c,tempResult);
finalResult.m_degenerate = true;
return false;
}
if (!pointOutsideABC && !pointOutsideACD && !pointOutsideADB && !pointOutsideBDC)
{
return false;
}
b3Scalar bestSqDist = FLT_MAX;
// If point outside face abc then compute closest point on abc
if (pointOutsideABC)
{
closestPtPointTriangle(p, a, b, c, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
b3Scalar sqDist = (q - p).dot( q - p);
// Update best closest point if (squared) distance is less than current best
if (sqDist < bestSqDist) {
b3Scalar sqDist = (q - p).dot(q - p);
// Update best closest point if (squared) distance is less than current best
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
//convert result bitmask!
@@ -501,25 +478,22 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const
finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexB;
finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexC;
finalResult.setBarycentricCoordinates(
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC],
0
);
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC],
0);
}
}
}
// Repeat test for face acd
if (pointOutsideACD)
if (pointOutsideACD)
{
closestPtPointTriangle(p, a, c, d,tempResult);
closestPtPointTriangle(p, a, c, d, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
//convert result bitmask!
b3Scalar sqDist = (q - p).dot( q - p);
if (sqDist < bestSqDist)
b3Scalar sqDist = (q - p).dot(q - p);
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
@@ -529,52 +503,46 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const
finalResult.m_usedVertices.usedVertexC = tempResult.m_usedVertices.usedVertexB;
finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexC;
finalResult.setBarycentricCoordinates(
tempResult.m_barycentricCoords[VERTA],
0,
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC]
);
tempResult.m_barycentricCoords[VERTA],
0,
tempResult.m_barycentricCoords[VERTB],
tempResult.m_barycentricCoords[VERTC]);
}
}
// Repeat test for face adb
}
// Repeat test for face adb
if (pointOutsideADB)
{
closestPtPointTriangle(p, a, d, b,tempResult);
closestPtPointTriangle(p, a, d, b, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
//convert result bitmask!
b3Scalar sqDist = (q - p).dot( q - p);
if (sqDist < bestSqDist)
b3Scalar sqDist = (q - p).dot(q - p);
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
finalResult.m_usedVertices.reset();
finalResult.m_usedVertices.usedVertexA = tempResult.m_usedVertices.usedVertexA;
finalResult.m_usedVertices.usedVertexB = tempResult.m_usedVertices.usedVertexC;
finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
finalResult.setBarycentricCoordinates(
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
0,
tempResult.m_barycentricCoords[VERTB]
);
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
0,
tempResult.m_barycentricCoords[VERTB]);
}
}
// Repeat test for face bdc
}
// Repeat test for face bdc
if (pointOutsideBDC)
{
closestPtPointTriangle(p, b, d, c,tempResult);
closestPtPointTriangle(p, b, d, c, tempResult);
b3Vector3 q = tempResult.m_closestPointOnSimplex;
//convert result bitmask!
b3Scalar sqDist = (q - p).dot( q - p);
if (sqDist < bestSqDist)
b3Scalar sqDist = (q - p).dot(q - p);
if (sqDist < bestSqDist)
{
bestSqDist = sqDist;
finalResult.m_closestPointOnSimplex = q;
@@ -585,25 +553,22 @@ bool b3VoronoiSimplexSolver::closestPtPointTetrahedron(const b3Vector3& p, const
finalResult.m_usedVertices.usedVertexD = tempResult.m_usedVertices.usedVertexB;
finalResult.setBarycentricCoordinates(
0,
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
tempResult.m_barycentricCoords[VERTB]
);
0,
tempResult.m_barycentricCoords[VERTA],
tempResult.m_barycentricCoords[VERTC],
tempResult.m_barycentricCoords[VERTB]);
}
}
}
//help! we ended up full !
if (finalResult.m_usedVertices.usedVertexA &&
finalResult.m_usedVertices.usedVertexB &&
finalResult.m_usedVertices.usedVertexC &&
finalResult.m_usedVertices.usedVertexD)
finalResult.m_usedVertices.usedVertexD)
{
return true;
}
return true;
return true;
}

View File

@@ -13,22 +13,19 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_VORONOI_SIMPLEX_SOLVER_H
#define B3_VORONOI_SIMPLEX_SOLVER_H
#include "Bullet3Common/b3Vector3.h"
#define VORONOI_SIMPLEX_MAX_VERTS 5
///disable next define, or use defaultCollisionConfiguration->getSimplexSolver()->setEqualVertexThreshold(0.f) to disable/configure
//#define BT_USE_EQUAL_VERTEX_THRESHOLD
#define VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD 0.0001f
struct b3UsageBitfield{
struct b3UsageBitfield
{
b3UsageBitfield()
{
reset();
@@ -41,137 +38,127 @@ struct b3UsageBitfield{
usedVertexC = false;
usedVertexD = false;
}
unsigned short usedVertexA : 1;
unsigned short usedVertexB : 1;
unsigned short usedVertexC : 1;
unsigned short usedVertexD : 1;
unsigned short unused1 : 1;
unsigned short unused2 : 1;
unsigned short unused3 : 1;
unsigned short unused4 : 1;
unsigned short usedVertexA : 1;
unsigned short usedVertexB : 1;
unsigned short usedVertexC : 1;
unsigned short usedVertexD : 1;
unsigned short unused1 : 1;
unsigned short unused2 : 1;
unsigned short unused3 : 1;
unsigned short unused4 : 1;
};
struct b3SubSimplexClosestResult
struct b3SubSimplexClosestResult
{
b3Vector3 m_closestPointOnSimplex;
b3Vector3 m_closestPointOnSimplex;
//MASK for m_usedVertices
//stores the simplex vertex-usage, using the MASK,
//stores the simplex vertex-usage, using the MASK,
// if m_usedVertices & MASK then the related vertex is used
b3UsageBitfield m_usedVertices;
b3Scalar m_barycentricCoords[4];
b3UsageBitfield m_usedVertices;
b3Scalar m_barycentricCoords[4];
bool m_degenerate;
void reset()
void reset()
{
m_degenerate = false;
setBarycentricCoordinates();
m_usedVertices.reset();
}
bool isValid()
bool isValid()
{
bool valid = (m_barycentricCoords[0] >= b3Scalar(0.)) &&
(m_barycentricCoords[1] >= b3Scalar(0.)) &&
(m_barycentricCoords[2] >= b3Scalar(0.)) &&
(m_barycentricCoords[3] >= b3Scalar(0.));
(m_barycentricCoords[1] >= b3Scalar(0.)) &&
(m_barycentricCoords[2] >= b3Scalar(0.)) &&
(m_barycentricCoords[3] >= b3Scalar(0.));
return valid;
}
void setBarycentricCoordinates(b3Scalar a=b3Scalar(0.),b3Scalar b=b3Scalar(0.),b3Scalar c=b3Scalar(0.),b3Scalar d=b3Scalar(0.))
void setBarycentricCoordinates(b3Scalar a = b3Scalar(0.), b3Scalar b = b3Scalar(0.), b3Scalar c = b3Scalar(0.), b3Scalar d = b3Scalar(0.))
{
m_barycentricCoords[0] = a;
m_barycentricCoords[1] = b;
m_barycentricCoords[2] = c;
m_barycentricCoords[3] = d;
}
};
/// b3VoronoiSimplexSolver is an implementation of the closest point distance algorithm from a 1-4 points simplex to the origin.
/// Can be used with GJK, as an alternative to Johnson distance algorithm.
B3_ATTRIBUTE_ALIGNED16(class) b3VoronoiSimplexSolver
B3_ATTRIBUTE_ALIGNED16(class)
b3VoronoiSimplexSolver
{
public:
B3_DECLARE_ALIGNED_ALLOCATOR();
int m_numVertices;
int m_numVertices;
b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexVectorW[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsP[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_simplexPointsQ[VORONOI_SIMPLEX_MAX_VERTS];
b3Vector3 m_cachedP1;
b3Vector3 m_cachedP2;
b3Vector3 m_cachedV;
b3Vector3 m_lastW;
b3Scalar m_equalVertexThreshold;
bool m_cachedValidClosest;
b3Vector3 m_cachedP1;
b3Vector3 m_cachedP2;
b3Vector3 m_cachedV;
b3Vector3 m_lastW;
b3Scalar m_equalVertexThreshold;
bool m_cachedValidClosest;
b3SubSimplexClosestResult m_cachedBC;
bool m_needsUpdate;
void removeVertex(int index);
void reduceVertices (const b3UsageBitfield& usedVerts);
bool updateClosestVectorAndPoints();
bool m_needsUpdate;
bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c,b3SubSimplexClosestResult& result);
void removeVertex(int index);
void reduceVertices(const b3UsageBitfield& usedVerts);
bool updateClosestVectorAndPoints();
bool closestPtPointTetrahedron(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d, b3SubSimplexClosestResult& finalResult);
int pointOutsideOfPlane(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, const b3Vector3& d);
bool closestPtPointTriangle(const b3Vector3& p, const b3Vector3& a, const b3Vector3& b, const b3Vector3& c, b3SubSimplexClosestResult& result);
public:
b3VoronoiSimplexSolver()
: m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
: m_equalVertexThreshold(VORONOI_DEFAULT_EQUAL_VERTEX_THRESHOLD)
{
}
void reset();
void reset();
void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);
void addVertex(const b3Vector3& w, const b3Vector3& p, const b3Vector3& q);
void setEqualVertexThreshold(b3Scalar threshold)
{
m_equalVertexThreshold = threshold;
}
void setEqualVertexThreshold(b3Scalar threshold)
{
m_equalVertexThreshold = threshold;
}
b3Scalar getEqualVertexThreshold() const
{
return m_equalVertexThreshold;
}
b3Scalar getEqualVertexThreshold() const
{
return m_equalVertexThreshold;
}
bool closest(b3Vector3& v);
bool closest(b3Vector3 & v);
b3Scalar maxVertex();
b3Scalar maxVertex();
bool fullSimplex() const
{
return (m_numVertices == 4);
}
bool fullSimplex() const
{
return (m_numVertices == 4);
}
int getSimplex(b3Vector3 *pBuf, b3Vector3 *qBuf, b3Vector3 *yBuf) const;
int getSimplex(b3Vector3 * pBuf, b3Vector3 * qBuf, b3Vector3 * yBuf) const;
bool inSimplex(const b3Vector3& w);
void backup_closest(b3Vector3& v) ;
bool inSimplex(const b3Vector3& w);
bool emptySimplex() const ;
void backup_closest(b3Vector3 & v);
void compute_points(b3Vector3& p1, b3Vector3& p2) ;
int numVertices() const
{
return m_numVertices;
}
bool emptySimplex() const;
void compute_points(b3Vector3 & p1, b3Vector3 & p2);
int numVertices() const
{
return m_numVertices;
}
};
#endif //B3_VORONOI_SIMPLEX_SOLVER_H
#endif //B3_VORONOI_SIMPLEX_SOLVER_H

View File

@@ -1,258 +1,257 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* bvhTraversalKernelCL= \
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef unsigned int u32;\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} btQuantizedBvhNode;\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"} b3BvhInfo;\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} btBvhSubtreeInfo;\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} btCollidableGpu;\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} btGpuChildShape;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"}\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int4* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n"
;
static const char* bvhTraversalKernelCL =
"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
"//written by Erwin Coumans\n"
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef unsigned int u32;\n"
"#define MAX_NUM_PARTS_IN_BITS 10\n"
"///btQuantizedBvhNode is a compressed aabb node, 16 bytes.\n"
"///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes\n"
" int m_escapeIndexOrTriangleIndex;\n"
"} btQuantizedBvhNode;\n"
"typedef struct\n"
"{\n"
" float4 m_aabbMin;\n"
" float4 m_aabbMax;\n"
" float4 m_quantization;\n"
" int m_numNodes;\n"
" int m_numSubTrees;\n"
" int m_nodeOffset;\n"
" int m_subTreeOffset;\n"
"} b3BvhInfo;\n"
"int getTriangleIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" unsigned int x=0;\n"
" unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);\n"
" // Get only the lower bits where the triangle index is stored\n"
" return (rootNode->m_escapeIndexOrTriangleIndex&~(y));\n"
"}\n"
"int isLeaf(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" //skipindex is negative (internal node), triangleindex >=0 (leafnode)\n"
" return (rootNode->m_escapeIndexOrTriangleIndex >= 0)? 1 : 0;\n"
"}\n"
" \n"
"int getEscapeIndex(const btQuantizedBvhNode* rootNode)\n"
"{\n"
" return -rootNode->m_escapeIndexOrTriangleIndex;\n"
"}\n"
"typedef struct\n"
"{\n"
" //12 bytes\n"
" unsigned short int m_quantizedAabbMin[3];\n"
" unsigned short int m_quantizedAabbMax[3];\n"
" //4 bytes, points to the root of the subtree\n"
" int m_rootNodeIndex;\n"
" //4 bytes\n"
" int m_subtreeSize;\n"
" int m_padding[3];\n"
"} btBvhSubtreeInfo;\n"
"///keep this in sync with btCollidable.h\n"
"typedef struct\n"
"{\n"
" int m_numChildShapes;\n"
" int blaat2;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
" \n"
"} btCollidableGpu;\n"
"typedef struct\n"
"{\n"
" float4 m_childPosition;\n"
" float4 m_childOrientation;\n"
" int m_shapeIndex;\n"
" int m_unused0;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"} btGpuChildShape;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} BodyData;\n"
"typedef struct \n"
"{\n"
" union\n"
" {\n"
" float4 m_min;\n"
" float m_minElems[4];\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float4 m_max;\n"
" float m_maxElems[4];\n"
" int m_maxIndices[4];\n"
" };\n"
"} btAabbCL;\n"
"int testQuantizedAabbAgainstQuantizedAabb(\n"
" const unsigned short int* aabbMin1,\n"
" const unsigned short int* aabbMax1,\n"
" const unsigned short int* aabbMin2,\n"
" const unsigned short int* aabbMax2)\n"
"{\n"
" //int overlap = 1;\n"
" if (aabbMin1[0] > aabbMax2[0])\n"
" return 0;\n"
" if (aabbMax1[0] < aabbMin2[0])\n"
" return 0;\n"
" if (aabbMin1[1] > aabbMax2[1])\n"
" return 0;\n"
" if (aabbMax1[1] < aabbMin2[1])\n"
" return 0;\n"
" if (aabbMin1[2] > aabbMax2[2])\n"
" return 0;\n"
" if (aabbMax1[2] < aabbMin2[2])\n"
" return 0;\n"
" return 1;\n"
" //overlap = ((aabbMin1[0] > aabbMax2[0]) || (aabbMax1[0] < aabbMin2[0])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[2] > aabbMax2[2]) || (aabbMax1[2] < aabbMin2[2])) ? 0 : overlap;\n"
" //overlap = ((aabbMin1[1] > aabbMax2[1]) || (aabbMax1[1] < aabbMin2[1])) ? 0 : overlap;\n"
" //return overlap;\n"
"}\n"
"void quantizeWithClamp(unsigned short* out, float4 point2,int isMax, float4 bvhAabbMin, float4 bvhAabbMax, float4 bvhQuantization)\n"
"{\n"
" float4 clampedPoint = max(point2,bvhAabbMin);\n"
" clampedPoint = min (clampedPoint, bvhAabbMax);\n"
" float4 v = (clampedPoint - bvhAabbMin) * bvhQuantization;\n"
" if (isMax)\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x+1.f) | 1));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y+1.f) | 1));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z+1.f) | 1));\n"
" } else\n"
" {\n"
" out[0] = (unsigned short) (((unsigned short)(v.x) & 0xfffe));\n"
" out[1] = (unsigned short) (((unsigned short)(v.y) & 0xfffe));\n"
" out[2] = (unsigned short) (((unsigned short)(v.z) & 0xfffe));\n"
" }\n"
"}\n"
"// work-in-progress\n"
"__kernel void bvhTraversalKernel( __global const int4* pairs, \n"
" __global const BodyData* rigidBodies, \n"
" __global const btCollidableGpu* collidables,\n"
" __global btAabbCL* aabbs,\n"
" __global int4* concavePairsOut,\n"
" __global volatile int* numConcavePairsOut,\n"
" __global const btBvhSubtreeInfo* subtreeHeadersRoot,\n"
" __global const btQuantizedBvhNode* quantizedNodesRoot,\n"
" __global const b3BvhInfo* bvhInfos,\n"
" int numPairs,\n"
" int maxNumConcavePairsCapacity)\n"
"{\n"
" int id = get_global_id(0);\n"
" if (id>=numPairs)\n"
" return;\n"
" \n"
" int bodyIndexA = pairs[id].x;\n"
" int bodyIndexB = pairs[id].y;\n"
" int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
" int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
" \n"
" //once the broadphase avoids static-static pairs, we can remove this test\n"
" if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
" {\n"
" return;\n"
" }\n"
" \n"
" if (collidables[collidableIndexA].m_shapeType!=SHAPE_CONCAVE_TRIMESH)\n"
" return;\n"
" int shapeTypeB = collidables[collidableIndexB].m_shapeType;\n"
" \n"
" if (shapeTypeB!=SHAPE_CONVEX_HULL &&\n"
" shapeTypeB!=SHAPE_SPHERE &&\n"
" shapeTypeB!=SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" )\n"
" return;\n"
" b3BvhInfo bvhInfo = bvhInfos[collidables[collidableIndexA].m_numChildShapes];\n"
" float4 bvhAabbMin = bvhInfo.m_aabbMin;\n"
" float4 bvhAabbMax = bvhInfo.m_aabbMax;\n"
" float4 bvhQuantization = bvhInfo.m_quantization;\n"
" int numSubtreeHeaders = bvhInfo.m_numSubTrees;\n"
" __global const btBvhSubtreeInfo* subtreeHeaders = &subtreeHeadersRoot[bvhInfo.m_subTreeOffset];\n"
" __global const btQuantizedBvhNode* quantizedNodes = &quantizedNodesRoot[bvhInfo.m_nodeOffset];\n"
" \n"
" unsigned short int quantizedQueryAabbMin[3];\n"
" unsigned short int quantizedQueryAabbMax[3];\n"
" quantizeWithClamp(quantizedQueryAabbMin,aabbs[bodyIndexB].m_min,false,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" quantizeWithClamp(quantizedQueryAabbMax,aabbs[bodyIndexB].m_max,true ,bvhAabbMin, bvhAabbMax,bvhQuantization);\n"
" \n"
" for (int i=0;i<numSubtreeHeaders;i++)\n"
" {\n"
" btBvhSubtreeInfo subtree = subtreeHeaders[i];\n"
" \n"
" int overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);\n"
" if (overlap != 0)\n"
" {\n"
" int startNodeIndex = subtree.m_rootNodeIndex;\n"
" int endNodeIndex = subtree.m_rootNodeIndex+subtree.m_subtreeSize;\n"
" int curIndex = startNodeIndex;\n"
" int escapeIndex;\n"
" int isLeafNode;\n"
" int aabbOverlap;\n"
" while (curIndex < endNodeIndex)\n"
" {\n"
" btQuantizedBvhNode rootNode = quantizedNodes[curIndex];\n"
" aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode.m_quantizedAabbMin,rootNode.m_quantizedAabbMax);\n"
" isLeafNode = isLeaf(&rootNode);\n"
" if (aabbOverlap)\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" int triangleIndex = getTriangleIndex(&rootNode);\n"
" if (shapeTypeB==SHAPE_COMPOUND_OF_CONVEX_HULLS)\n"
" {\n"
" int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
" int pairIdx = atomic_add(numConcavePairsOut,numChildrenB);\n"
" for (int b=0;b<numChildrenB;b++)\n"
" {\n"
" if ((pairIdx+b)<maxNumConcavePairsCapacity)\n"
" {\n"
" int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,childShapeIndexB);\n"
" concavePairsOut[pairIdx+b] = newPair;\n"
" }\n"
" }\n"
" } else\n"
" {\n"
" int pairIdx = atomic_inc(numConcavePairsOut);\n"
" if (pairIdx<maxNumConcavePairsCapacity)\n"
" {\n"
" int4 newPair = (int4)(bodyIndexA,bodyIndexB,triangleIndex,0);\n"
" concavePairsOut[pairIdx] = newPair;\n"
" }\n"
" }\n"
" } \n"
" curIndex++;\n"
" } else\n"
" {\n"
" if (isLeafNode)\n"
" {\n"
" curIndex++;\n"
" } else\n"
" {\n"
" escapeIndex = getEscapeIndex(&rootNode);\n"
" curIndex += escapeIndex;\n"
" }\n"
" }\n"
" }\n"
" }\n"
" }\n"
"}\n";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -19,149 +19,139 @@ subject to the following restrictions:
#define KERNEL1 "SearchSortDataUpperKernel"
#define KERNEL2 "SubtractKernel"
#include "b3BoundSearchCL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "b3LauncherCL.h"
#include "kernels/BoundSearchKernelsCL.h"
b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
:m_context(ctx),
m_device(device),
m_queue(queue)
: m_context(ctx),
m_device(device),
m_queue(queue)
{
const char* additionalMacros = "";
//const char* srcFileNameForCaching="";
cl_int pErrNum;
const char* kernelSource = boundSearchKernelsCL;
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH);
b3Assert(boundSearchProg);
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
b3Assert(m_lowerSortDataKernel );
m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_lowerSortDataKernel);
m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_upperSortDataKernel);
m_subtractKernel = 0;
if( maxSize )
if (maxSize)
{
m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros);
b3Assert(m_subtractKernel);
}
//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );
m_filler = new b3FillCL(ctx,device,queue);
m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_upper = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
m_filler = new b3FillCL(ctx, device, queue);
}
b3BoundSearchCL::~b3BoundSearchCL()
{
delete m_lower;
delete m_upper;
delete m_filler;
clReleaseKernel(m_lowerSortDataKernel);
clReleaseKernel(m_upperSortDataKernel);
clReleaseKernel(m_subtractKernel);
}
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option)
{
b3Int4 constBuffer;
constBuffer.x = nSrc;
constBuffer.y = nDst;
if( option == BOUND_LOWER )
if (option == BOUND_LOWER)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
launcher.launch1D( nSrc, 64 );
b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D(nSrc, 64);
}
else if( option == BOUND_UPPER )
else if (option == BOUND_UPPER)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
b3LauncherCL launcher(m_queue, m_upperSortDataKernel, "m_upperSortDataKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D( nSrc, 64 );
launcher.launch1D(nSrc, 64);
}
else if( option == COUNT )
else if (option == COUNT)
{
b3Assert( m_lower );
b3Assert( m_upper );
b3Assert( m_lower->capacity() <= (int)nDst );
b3Assert( m_upper->capacity() <= (int)nDst );
b3Assert(m_lower);
b3Assert(m_upper);
b3Assert(m_lower->capacity() <= (int)nDst);
b3Assert(m_upper->capacity() <= (int)nDst);
int zero = 0;
m_filler->execute( *m_lower, zero, nDst );
m_filler->execute( *m_upper, zero, nDst );
m_filler->execute(*m_lower, zero, nDst);
m_filler->execute(*m_upper, zero, nDst);
execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
execute(src, nSrc, *m_lower, nDst, BOUND_LOWER);
execute(src, nSrc, *m_upper, nDst, BOUND_UPPER);
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};
b3LauncherCL launcher( m_queue, m_subtractKernel ,"m_subtractKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( nSrc );
launcher.setConst( nDst );
b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(nSrc);
launcher.setConst(nDst);
launcher.launch1D( nDst, 64 );
launcher.launch1D(nDst, 64);
}
}
else
{
b3Assert( 0 );
b3Assert(0);
}
}
void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option )
void b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc,
b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option)
{
for (int i = 0; i < nSrc - 1; i++)
b3Assert(src[i].m_key <= src[i + 1].m_key);
for(int i=0; i<nSrc-1; i++)
b3Assert( src[i].m_key <= src[i+1].m_key );
b3SortData minData,zeroData,maxData;
b3SortData minData, zeroData, maxData;
minData.m_key = -1;
minData.m_value = -1;
zeroData.m_key=0;
zeroData.m_value=0;
zeroData.m_key = 0;
zeroData.m_value = 0;
maxData.m_key = nDst;
maxData.m_value = nDst;
if( option == BOUND_LOWER )
if (option == BOUND_LOWER)
{
for(int i=0; i<nSrc; i++)
for (int i = 0; i < nSrc; i++)
{
b3SortData& iData = (i==0)? minData: src[i-1];
b3SortData& jData = (i==nSrc)? maxData: src[i];
b3SortData& iData = (i == 0) ? minData : src[i - 1];
b3SortData& jData = (i == nSrc) ? maxData : src[i];
if( iData.m_key != jData.m_key )
if (iData.m_key != jData.m_key)
{
int k = jData.m_key;
{
@@ -170,14 +160,14 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
}
}
}
else if( option == BOUND_UPPER )
else if (option == BOUND_UPPER)
{
for(int i=1; i<nSrc+1; i++)
for (int i = 1; i < nSrc + 1; i++)
{
b3SortData& iData = src[i-1];
b3SortData& jData = (i==nSrc)? maxData: src[i];
b3SortData& iData = src[i - 1];
b3SortData& jData = (i == nSrc) ? maxData : src[i];
if( iData.m_key != jData.m_key )
if (iData.m_key != jData.m_key)
{
int k = iData.m_key;
{
@@ -186,28 +176,28 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
}
}
}
else if( option == COUNT )
else if (option == COUNT)
{
b3AlignedObjectArray<unsigned int> lower;
lower.resize(nDst );
lower.resize(nDst);
b3AlignedObjectArray<unsigned int> upper;
upper.resize(nDst );
upper.resize(nDst);
for(int i=0; i<nDst; i++)
{
lower[i] = upper[i] = 0;
for (int i = 0; i < nDst; i++)
{
lower[i] = upper[i] = 0;
}
executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
executeHost(src, nSrc, lower, nDst, BOUND_LOWER);
executeHost(src, nSrc, upper, nDst, BOUND_UPPER);
for( int i=0; i<nDst; i++)
{
dst[i] = upper[i] - lower[i];
for (int i = 0; i < nDst; i++)
{
dst[i] = upper[i] - lower[i];
}
}
else
{
b3Assert( 0 );
b3Assert(0);
}
}

View File

@@ -26,42 +26,39 @@ subject to the following restrictions:
#include "b3OpenCLArray.h"
#include "b3FillCL.h"
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
class b3BoundSearchCL
{
public:
public:
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
enum Option
{
BOUND_LOWER,
BOUND_UPPER,
COUNT,
};
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
cl_kernel m_subtractKernel;
cl_kernel m_lowerSortDataKernel;
cl_kernel m_upperSortDataKernel;
cl_kernel m_subtractKernel;
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
b3FillCL* m_filler;
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
b3OpenCLArray<unsigned int>* m_lower;
b3OpenCLArray<unsigned int>* m_upper;
virtual ~b3BoundSearchCL();
b3FillCL* m_filler;
// src has to be src[i].m_key <= src[i+1].m_key
void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
virtual ~b3BoundSearchCL();
// src has to be src[i].m_key <= src[i+1].m_key
void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
};
#endif //B3_BOUNDSEARCH_H
#endif //B3_BOUNDSEARCH_H

View File

@@ -4,16 +4,15 @@
#include "b3OpenCLArray.h"
struct b3BufferInfoCL
{
//b3BufferInfoCL(){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
// template<typename T>
b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {}
cl_mem m_clBuffer;
bool m_isReadOnly;
};
#endif //B3_BUFFER_INFO_CL_H
#endif //B3_BUFFER_INFO_CL_H

View File

@@ -8,29 +8,26 @@
#include "kernels/FillKernelsCL.h"
b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
:m_commandQueue(queue)
: m_commandQueue(queue)
{
const char* kernelSource = fillKernelsCL;
cl_int pErrNum;
const char* additionalMacros = "";
cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH);
b3Assert(fillProg);
m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillIntKernel);
m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillIntKernel);
m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillFloatKernel);
m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros);
b3Assert(m_fillKernelInt2);
}
b3FillCL::~b3FillCL()
@@ -39,88 +36,84 @@ b3FillCL::~b3FillCL()
clReleaseKernel(m_fillIntKernel);
clReleaseKernel(m_fillUnsignedIntKernel);
clReleaseKernel(m_fillFloatKernel);
}
void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" );
launcher.setBuffer( src.getBufferCL());
launcher.setConst( n );
launcher.setConst( value );
launcher.setConst( offset);
b3LauncherCL launcher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel");
launcher.setBuffer(src.getBufferCL());
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D( n );
launcher.launch1D(n);
}
}
void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel");
b3LauncherCL launcher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel");
launcher.setBuffer(src.getBufferCL());
launcher.setConst( n);
launcher.setConst( value);
launcher.setConst( offset);
launcher.launch1D( n );
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D(n);
}
}
void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};
b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( n );
launcher.setConst(value);
b3LauncherCL launcher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
launcher.launch1D( n );
launcher.launch1D(n);
}
}
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
for (int i=0;i<n;i++)
for (int i = 0; i < n; i++)
{
src[i+offset]=value;
src[i + offset] = value;
}
}
void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset)
{
for (int i=0;i<n;i++)
for (int i = 0; i < n; i++)
{
src[i+offset]=value;
src[i + offset] = value;
}
}
void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset)
{
b3Assert( n>0 );
b3Assert(n > 0);
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(n);
launcher.setConst(value);
launcher.setConst(offset);
//( constBuffer );
launcher.launch1D( n );
launcher.launch1D(n);
}
}

View File

@@ -7,57 +7,46 @@
#include "Bullet3Common/shared/b3Int2.h"
#include "Bullet3Common/shared/b3Int4.h"
class b3FillCL
{
cl_command_queue m_commandQueue;
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
cl_command_queue m_commandQueue;
public:
struct b3ConstData
{
union
{
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
protected:
cl_kernel m_fillKernelInt2;
cl_kernel m_fillIntKernel;
cl_kernel m_fillUnsignedIntKernel;
cl_kernel m_fillFloatKernel;
public:
struct b3ConstData
{
union {
b3Int4 m_data;
b3UnsignedInt4 m_UnsignedData;
};
int m_offset;
int m_n;
int m_padding[2];
};
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
protected:
public:
b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
virtual ~b3FillCL();
virtual ~b3FillCL();
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset);
void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset);
void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset);
// void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
};
#endif //B3_FILL_CL_H
#endif //B3_FILL_CL_H

View File

@@ -1,13 +1,13 @@
#include "b3LauncherCL.h"
bool gDebugLauncherCL = false;
b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name)
:m_commandQueue(queue),
m_kernel(kernel),
m_idx(0),
m_enableSerialization(false),
m_name(name)
: m_commandQueue(queue),
m_kernel(kernel),
m_idx(0),
m_enableSerialization(false),
m_name(name)
{
if (gDebugLauncherCL)
{
@@ -15,59 +15,58 @@ m_name(name)
printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
}
m_serializationSizeInBytes = sizeof(int);
m_serializationSizeInBytes = sizeof(int);
}
b3LauncherCL::~b3LauncherCL()
{
for (int i=0;i<m_arrays.size();i++)
{
delete (m_arrays[i]);
}
m_arrays.clear();
if (gDebugLauncherCL)
{
static int counter = 0;
printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name);
}
}
void b3LauncherCL::setBuffer( cl_mem clBuffer)
{
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert( err == CL_SUCCESS );
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(b3KernelArgData);
m_serializationSizeInBytes+=param_value;
}
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
b3Assert( status == CL_SUCCESS );
for (int i = 0; i < m_arrays.size(); i++)
{
delete (m_arrays[i]);
}
m_arrays.clear();
if (gDebugLauncherCL)
{
static int counter = 0;
printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name);
}
}
void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
void b3LauncherCL::setBuffer(cl_mem clBuffer)
{
for(int i=0; i<n; i++)
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo(kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert(err == CL_SUCCESS);
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes += sizeof(b3KernelArgData);
m_serializationSizeInBytes += param_value;
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
b3Assert(status == CL_SUCCESS);
}
void b3LauncherCL::setBuffers(b3BufferInfoCL* buffInfo, int n)
{
for (int i = 0; i < n; i++)
{
if (m_enableSerialization)
{
@@ -75,106 +74,103 @@ void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 1;
kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
cl_mem_info param_name = CL_MEM_SIZE;
size_t param_value;
size_t sizeInBytes = sizeof(size_t);
size_t actualSizeInBytes;
cl_int err;
err = clGetMemObjectInfo ( kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert( err == CL_SUCCESS );
err = clGetMemObjectInfo(kernelArg.m_clBuffer,
param_name,
sizeInBytes,
&param_value,
&actualSizeInBytes);
b3Assert(err == CL_SUCCESS);
kernelArg.m_argSizeInBytes = param_value;
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+= sizeof(b3KernelArgData);
m_serializationSizeInBytes+=param_value;
}
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
b3Assert( status == CL_SUCCESS );
}
m_serializationSizeInBytes += sizeof(b3KernelArgData);
m_serializationSizeInBytes += param_value;
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
b3Assert(status == CL_SUCCESS);
}
}
struct b3KernelArgDataUnaligned
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union
{
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
#include <string.h>
int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &buf[index];
index+=sizeof(int);
for (int i=0;i<numArguments;i++)
{
b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
int index = 0;
index+=sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert( status == CL_SUCCESS );
index+=arg->m_argSizeInBytes;
} else
{
cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert( status == CL_SUCCESS );
}
int numArguments = *(int*)&buf[index];
index += sizeof(int);
for (int i = 0; i < numArguments; i++)
{
b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
index += sizeof(b3KernelArgData);
if (arg->m_isBuffer)
{
b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes);
clData->resize(arg->m_argSizeInBytes);
clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
arg->m_clBuffer = clData->getBufferCL();
m_arrays.push_back(clData);
cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
b3Assert(status == CL_SUCCESS);
index += arg->m_argSizeInBytes;
}
else
{
cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
b3Assert(status == CL_SUCCESS);
}
b3KernelArgData b;
memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned));
m_kernelArguments.push_back(b);
}
m_serializationSizeInBytes = index;
return index;
memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned));
m_kernelArguments.push_back(b);
}
m_serializationSizeInBytes = index;
return index;
}
int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
{
int index=0;
int numArguments = *(int*) &goldBuffer[index];
index+=sizeof(int);
{
int index = 0;
int numArguments = *(int*)&goldBuffer[index];
index += sizeof(int);
if (numArguments != m_kernelArguments.size())
{
printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size());
return -1;
}
for (int ii=0;ii<numArguments;ii++)
{
b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
for (int ii = 0; ii < numArguments; ii++)
{
b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
{
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
return -2;
}
@@ -184,125 +180,117 @@ int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapac
if (expected != found)
{
printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found);
return -3;
}
}
index+=sizeof(b3KernelArgData);
index += sizeof(b3KernelArgData);
if (argGold->m_isBuffer)
{
unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
{
unsigned char* memBuf = (unsigned char*)malloc(m_kernelArguments[ii].m_argSizeInBytes);
unsigned char* goldBuf = &goldBuffer[index];
for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
for (int j = 0; j < m_kernelArguments[j].m_argSizeInBytes; j++)
{
memBuf[j] = 0xaa;
}
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
memBuf, 0,0,0 );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
memBuf, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
clFinish(m_commandQueue);
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
{
int expected = goldBuf[b];
int found = memBuf[b];
if (expected != found)
{
printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
ii, b, expected, found);
ii, b, expected, found);
return -4;
}
}
index+=argGold->m_argSizeInBytes;
} else
{
index += argGold->m_argSizeInBytes;
}
else
{
//compare content
for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
{
int expected = argGold->m_argData[b];
int found =m_kernelArguments[ii].m_argData[b];
int found = m_kernelArguments[ii].m_argData[b];
if (expected != found)
{
printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
ii, b, expected, found);
ii, b, expected, found);
return -5;
}
}
}
}
return index;
}
}
return index;
}
int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
{
//initialize to known values
for (int i=0;i<destBufferCapacity;i++)
destBuffer[i] = 0xec;
//initialize to known values
for (int i = 0; i < destBufferCapacity; i++)
destBuffer[i] = 0xec;
assert(destBufferCapacity>=m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i=0;i<this->m_kernelArguments.size();i++)
{
b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize+=sizeof(b3KernelArgData);
if (arg->m_isBuffer==1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0,0,0 );
b3Assert( status==CL_SUCCESS );
clFinish(m_commandQueue);
curBufferSize+=arg->m_argSizeInBytes;
}
}
return curBufferSize;
assert(destBufferCapacity >= m_serializationSizeInBytes);
//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
int numArguments = m_kernelArguments.size();
int curBufferSize = 0;
int* dest = (int*)&destBuffer[curBufferSize];
*dest = numArguments;
curBufferSize += sizeof(int);
for (int i = 0; i < this->m_kernelArguments.size(); i++)
{
b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize];
*arg = m_kernelArguments[i];
curBufferSize += sizeof(b3KernelArgData);
if (arg->m_isBuffer == 1)
{
//copy the OpenCL buffer content
cl_int status = 0;
status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
&destBuffer[curBufferSize], 0, 0, 0);
b3Assert(status == CL_SUCCESS);
clFinish(m_commandQueue);
curBufferSize += arg->m_argSizeInBytes;
}
}
return curBufferSize;
}
void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
{
int num = numWorkItems;
int buffSize = getSerializationBufferSize();
unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
for (int i=0;i<buffSize+1;i++)
unsigned char* buf = new unsigned char[buffSize + sizeof(int)];
for (int i = 0; i < buffSize + 1; i++)
{
unsigned char* ptr = (unsigned char*)&buf[i];
*ptr = 0xff;
}
// int actualWrite = serializeArguments(buf,buffSize);
// unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize]==0xff);//check for buffer overrun
// int actualWrite = serializeArguments(buf,buffSize);
// unsigned char* cptr = (unsigned char*)&buf[buffSize];
// printf("buf[buffSize] = %d\n",*cptr);
assert(buf[buffSize] == 0xff); //check for buffer overrun
int* ptr = (int*)&buf[buffSize];
*ptr = num;
FILE* f = fopen(fileName,"wb");
fwrite(buf,buffSize+sizeof(int),1,f);
FILE* f = fopen(fileName, "wb");
fwrite(buf, buffSize + sizeof(int), 1, f);
fclose(f);
delete[] buf;
}
}

View File

@@ -9,60 +9,57 @@
#define B3_DEBUG_SERIALIZE_CL
#ifdef _WIN32
#pragma warning(disable :4996)
#pragma warning(disable : 4996)
#endif
#define B3_CL_MAX_ARG_SIZE 16
B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData
B3_ATTRIBUTE_ALIGNED16(struct)
b3KernelArgData
{
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_isBuffer;
int m_argIndex;
int m_argSizeInBytes;
int m_unusedPadding;
union
{
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
union {
cl_mem m_clBuffer;
unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
};
};
class b3LauncherCL
{
cl_command_queue m_commandQueue;
cl_kernel m_kernel;
int m_idx;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
bool m_enableSerialization;
b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
int m_serializationSizeInBytes;
bool m_enableSerialization;
const char* m_name;
public:
b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays;
b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
virtual ~b3LauncherCL();
void setBuffer( cl_mem clBuffer);
public:
b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays;
void setBuffers( b3BufferInfoCL* buffInfo, int n );
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
virtual ~b3LauncherCL();
void setBuffer(cl_mem clBuffer);
void setBuffers(b3BufferInfoCL* buffInfo, int n);
int getSerializationBufferSize() const
{
return m_serializationSizeInBytes;
}
int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);
int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
int getNumArguments() const
{
return m_kernelArguments.size();
@@ -75,61 +72,57 @@ class b3LauncherCL
void serializeToFile(const char* fileName, int numWorkItems);
template<typename T>
inline void setConst( const T& consts )
{
int sz=sizeof(T);
b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
template <typename T>
inline void setConst(const T& consts)
{
int sz = sizeof(T);
b3Assert(sz <= B3_CL_MAX_ARG_SIZE);
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes+=sizeof(b3KernelArgData);
}
cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
b3Assert( status == CL_SUCCESS );
if (m_enableSerialization)
{
b3KernelArgData kernelArg;
kernelArg.m_argIndex = m_idx;
kernelArg.m_isBuffer = 0;
T* destArg = (T*)kernelArg.m_argData;
*destArg = consts;
kernelArg.m_argSizeInBytes = sizeof(T);
m_kernelArguments.push_back(kernelArg);
m_serializationSizeInBytes += sizeof(b3KernelArgData);
}
inline void launch1D( int numThreads, int localSize = 64)
{
launch2D( numThreads, 1, localSize, 1 );
}
cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts);
b3Assert(status == CL_SUCCESS);
}
inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
{
size_t gRange[3] = {1,1,1};
size_t lRange[3] = {1,1,1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
gRange[0] *= lRange[0];
gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
gRange[1] *= lRange[1];
inline void launch1D(int numThreads, int localSize = 64)
{
launch2D(numThreads, 1, localSize, 1);
}
cl_int status = clEnqueueNDRangeKernel( m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
if (status != CL_SUCCESS)
{
printf("Error: OpenCL status = %d\n",status);
}
b3Assert( status == CL_SUCCESS );
inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY)
{
size_t gRange[3] = {1, 1, 1};
size_t lRange[3] = {1, 1, 1};
lRange[0] = localSizeX;
lRange[1] = localSizeY;
gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1));
gRange[0] *= lRange[0];
gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 0 : 1));
gRange[1] *= lRange[1];
}
void enableSerialization(bool serialize)
cl_int status = clEnqueueNDRangeKernel(m_commandQueue,
m_kernel, 2, NULL, gRange, lRange, 0, 0, 0);
if (status != CL_SUCCESS)
{
m_enableSerialization = serialize;
printf("Error: OpenCL status = %d\n", status);
}
b3Assert(status == CL_SUCCESS);
}
void enableSerialization(bool serialize)
{
m_enableSerialization = serialize;
}
};
#endif //B3_LAUNCHER_CL_H
#endif //B3_LAUNCHER_CL_H

View File

@@ -7,16 +7,16 @@
template <typename T>
class b3OpenCLArray
{
size_t m_size;
size_t m_capacity;
cl_mem m_clBuffer;
size_t m_size;
size_t m_capacity;
cl_mem m_clBuffer;
cl_context m_clContext;
cl_context m_clContext;
cl_command_queue m_commandQueue;
bool m_ownsMemory;
bool m_ownsMemory;
bool m_allowGrowingCapacity;
bool m_allowGrowingCapacity;
void deallocate()
{
@@ -25,22 +25,19 @@ class b3OpenCLArray
clReleaseMemObject(m_clBuffer);
}
m_clBuffer = 0;
m_capacity=0;
m_capacity = 0;
}
b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);
B3_FORCE_INLINE size_t allocSize(size_t size)
{
return (size ? size*2 : 1);
}
B3_FORCE_INLINE size_t allocSize(size_t size)
{
return (size ? size * 2 : 1);
}
public:
b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true)
:m_size(0), m_capacity(0),m_clBuffer(0),
m_clContext(ctx),m_commandQueue(queue),
m_ownsMemory(true),m_allowGrowingCapacity(true)
b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true)
: m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true)
{
if (initialCapacity)
{
@@ -60,34 +57,32 @@ public:
m_capacity = sizeInElements;
}
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
// }
// we could enable this assignment, but need to make sure to avoid accidental deep copies
// b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
// {
// copyFromArray(src);
// return *this;
// }
cl_mem getBufferCL() const
cl_mem getBufferCL() const
{
return m_clBuffer;
}
virtual ~b3OpenCLArray()
{
deallocate();
m_size=0;
m_capacity=0;
m_size = 0;
m_capacity = 0;
}
B3_FORCE_INLINE bool push_back(const T& _Val,bool waitForCompletion=true)
B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true)
{
bool result = true;
size_t sz = size();
if( sz == capacity() )
if (sz == capacity())
{
result = reserve( allocSize(size()) );
result = reserve(allocSize(size()));
}
copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
m_size++;
@@ -96,23 +91,23 @@ public:
B3_FORCE_INLINE T forcedAt(size_t n) const
{
b3Assert(n>=0);
b3Assert(n<capacity());
b3Assert(n >= 0);
b3Assert(n < capacity());
T elem;
copyToHostPointer(&elem,1,n,true);
copyToHostPointer(&elem, 1, n, true);
return elem;
}
B3_FORCE_INLINE T at(size_t n) const
{
b3Assert(n>=0);
b3Assert(n<size());
b3Assert(n >= 0);
b3Assert(n < size());
T elem;
copyToHostPointer(&elem,1,n,true);
copyToHostPointer(&elem, 1, n, true);
return elem;
}
B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents=true)
B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true)
{
bool result = true;
size_t curSize = size();
@@ -120,11 +115,12 @@ public:
if (newsize < curSize)
{
//leave the OpenCL memory for now
} else
}
else
{
if (newsize > size())
{
result = reserve(newsize,copyOldContents);
result = reserve(newsize, copyOldContents);
}
//leave new data uninitialized (init in debug mode?)
@@ -134,7 +130,8 @@ public:
if (result)
{
m_size = newsize;
} else
}
else
{
m_size = 0;
}
@@ -146,25 +143,25 @@ public:
return m_size;
}
B3_FORCE_INLINE size_t capacity() const
B3_FORCE_INLINE size_t capacity() const
{
return m_capacity;
}
B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents=true)
B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true)
{
bool result=true;
bool result = true;
// determine new minimum length of allocated storage
if (capacity() < _Count)
{ // not enough room, reallocate
{ // not enough room, reallocate
if (m_allowGrowingCapacity)
{
cl_int ciErrNum;
//create a new OpenCL buffer
size_t memSizeInBytes = sizeof(T)*_Count;
size_t memSizeInBytes = sizeof(T) * _Count;
cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
if (ciErrNum!=CL_SUCCESS)
if (ciErrNum != CL_SUCCESS)
{
b3Error("OpenCL out-of-memory\n");
_Count = 0;
@@ -173,13 +170,13 @@ public:
//#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
for (size_t i=0;i<memSizeInBytes;i++)
for (size_t i = 0; i < memSizeInBytes; i++)
src[i] = 0xbb;
ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
b3Assert(ciErrNum==CL_SUCCESS);
ciErrNum = clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0);
b3Assert(ciErrNum == CL_SUCCESS);
clFinish(m_commandQueue);
free(src);
#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
if (result)
{
@@ -193,21 +190,21 @@ public:
m_clBuffer = buf;
m_capacity = _Count;
} else
}
else
{
//fail: assert and
b3Assert(0);
deallocate();
result=false;
result = false;
}
}
return result;
}
void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const
void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const
{
if (numElements<=0)
if (numElements <= 0)
return;
b3Assert(m_clBuffer);
@@ -216,75 +213,74 @@ public:
//likely some error, destination is same as source
b3Assert(m_clBuffer != destination);
b3Assert((firstElem+numElements)<=m_size);
b3Assert((firstElem + numElements) <= m_size);
cl_int status = 0;
b3Assert(numElements > 0);
b3Assert(numElements <= m_size);
b3Assert(numElements>0);
b3Assert(numElements<=m_size);
size_t srcOffsetBytes = sizeof(T) * firstElem;
size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems;
size_t srcOffsetBytes = sizeof(T)*firstElem;
size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0);
status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
b3Assert( status == CL_SUCCESS );
b3Assert(status == CL_SUCCESS);
}
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true)
{
size_t newSize = srcArray.size();
bool copyOldContents = false;
resize (newSize,copyOldContents);
resize(newSize, copyOldContents);
if (newSize)
copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion);
}
void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true)
void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true)
{
b3Assert(numElems+destFirstElem <= capacity());
b3Assert(numElems + destFirstElem <= capacity());
if (numElems+destFirstElem)
if (numElems + destFirstElem)
{
cl_int status = 0;
size_t sizeInBytes=sizeof(T)*numElems;
status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
src, 0,0,0 );
b3Assert(status == CL_SUCCESS );
size_t sizeInBytes = sizeof(T) * numElems;
status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes,
src, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
} else
}
else
{
b3Error("copyFromHostPointer invalid range\n");
}
}
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const
{
destArray.resize(this->size());
if (size())
copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
copyToHostPointer(&destArray[0], size(), 0, waitForCompletion);
}
void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const
void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const
{
b3Assert(numElem+srcFirstElem <= capacity());
b3Assert(numElem + srcFirstElem <= capacity());
if(numElem+srcFirstElem <= capacity())
if (numElem + srcFirstElem <= capacity())
{
cl_int status = 0;
status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
destPtr, 0,0,0 );
b3Assert( status==CL_SUCCESS );
status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem,
destPtr, 0, 0, 0);
b3Assert(status == CL_SUCCESS);
if (waitForCompletion)
clFinish(m_commandQueue);
} else
}
else
{
b3Error("copyToHostPointer invalid range\n");
}
@@ -296,11 +292,9 @@ public:
resize(newSize);
if (size())
{
src.copyToCL(m_clBuffer,size());
src.copyToCL(m_clBuffer, size());
}
}
};
#endif //B3_OPENCL_ARRAY_H
#endif //B3_OPENCL_ARRAY_H

View File

@@ -7,25 +7,24 @@
#include "kernels/PrefixScanKernelsCL.h"
b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsCL;
cl_int pErrNum;
char* additionalMacros=0;
char* additionalMacros = 0;
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH);
m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_localScanKernel );
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_blockSumKernel );
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_propagationKernel );
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanCL::~b3PrefixScanCL()
{
delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanCL::~b3PrefixScanCL()
clReleaseKernel(m_propagationKernel);
}
template<class T>
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
n = n | (n>>i);
return n+1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
{
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<uns
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2( numBlocks );
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<unsigned int>* srcNative = &src;
b3OpenCLArray<unsigned int>* dstNative = &dst;
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if( numBlocks > 1 )
if (numBlocks > 1)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if( sum )
if (sum)
{
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum,1,n-1,true);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
{
unsigned int s = 0;
//if( data->m_option == EXCLUSIVE )
{
for(int i=0; i<n; i++)
for (int i = 0; i < n; i++)
{
dst[i] = s;
s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3Alig
}
*/
if( sum )
if (sum)
{
*sum = dst[n-1];
*sum = dst[n - 1];
}
}

View File

@@ -13,9 +13,9 @@ class b3PrefixScanCL
BLOCK_SIZE = 128
};
// Option m_option;
// Option m_option;
cl_command_queue m_commandQueue;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
@@ -23,15 +23,13 @@ class b3PrefixScanCL
b3OpenCLArray<unsigned int>* m_workBuffer;
public:
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
public:
b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
virtual ~b3PrefixScanCL();
void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0);
void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0);
};
#endif //B3_PREFIX_SCAN_CL_H
#endif //B3_PREFIX_SCAN_CL_H

View File

@@ -7,25 +7,24 @@
#include "kernels/PrefixScanKernelsFloat4CL.h"
b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
:m_commandQueue(queue)
: m_commandQueue(queue)
{
const char* scanKernelSource = prefixScanKernelsFloat4CL;
cl_int pErrNum;
char* additionalMacros=0;
char* additionalMacros = 0;
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size);
cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
b3Assert(scanProg);
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_localScanKernel );
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_blockSumKernel );
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
b3Assert(m_propagationKernel );
m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_localScanKernel);
m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_blockSumKernel);
m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
b3Assert(m_propagationKernel);
}
b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
{
delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
clReleaseKernel(m_propagationKernel);
}
template<class T>
template <class T>
T b3NextPowerOf2(T n)
{
n -= 1;
for(int i=0; i<sizeof(T)*8; i++)
n = n | (n>>i);
return n+1;
for (int i = 0; i < sizeof(T) * 8; i++)
n = n | (n >> i);
return n + 1;
}
void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
// b3Assert( data->m_option == EXCLUSIVE );
const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));
dst.resize(src.size());
m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<
b3Int4 constBuffer;
constBuffer.x = n;
constBuffer.y = numBlocks;
constBuffer.z = (int)b3NextPowerOf2( numBlocks );
constBuffer.z = (int)b3NextPowerOf2(numBlocks);
b3OpenCLArray<b3Vector3>* srcNative = &src;
b3OpenCLArray<b3Vector3>* dstNative = &dst;
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
{
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
}
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
}
if( numBlocks > 1 )
if (numBlocks > 1)
{
b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel");
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
launcher.setConst( constBuffer );
launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(constBuffer);
launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
}
if( sum )
if (sum)
{
clFinish(m_commandQueue);
dstNative->copyToHostPointer(sum,1,n-1,true);
dstNative->copyToHostPointer(sum, 1, n - 1, true);
}
}
void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
{
b3Vector3 s=b3MakeVector3(0,0,0);
b3Vector3 s = b3MakeVector3(0, 0, 0);
//if( data->m_option == EXCLUSIVE )
{
for(int i=0; i<n; i++)
for (int i = 0; i < n; i++)
{
dst[i] = s;
s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3A
}
*/
if( sum )
if (sum)
{
*sum = dst[n-1];
*sum = dst[n - 1];
}
}

View File

@@ -14,9 +14,9 @@ class b3PrefixScanFloat4CL
BLOCK_SIZE = 128
};
// Option m_option;
// Option m_option;
cl_command_queue m_commandQueue;
cl_command_queue m_commandQueue;
cl_kernel m_localScanKernel;
cl_kernel m_blockSumKernel;
@@ -24,10 +24,8 @@ class b3PrefixScanFloat4CL
b3OpenCLArray<b3Vector3>* m_workBuffer;
public:
b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
public:
b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);
virtual ~b3PrefixScanFloat4CL();
@@ -35,4 +33,4 @@ class b3PrefixScanFloat4CL
void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
};
#endif //B3_PREFIX_SCAN_CL_H
#endif //B3_PREFIX_SCAN_CL_H

File diff suppressed because it is too large Load Diff

View File

@@ -6,90 +6,79 @@
struct b3SortData
{
union
{
union {
unsigned int m_key;
unsigned int x;
};
union
{
union {
unsigned int m_value;
unsigned int y;
};
};
#include "b3BufferInfoCL.h"
class b3RadixSort32CL
class b3RadixSort32CL
{
b3OpenCLArray<unsigned int>* m_workBuffer1;
b3OpenCLArray<unsigned int>* m_workBuffer2;
b3OpenCLArray<unsigned int>* m_workBuffer1;
b3OpenCLArray<unsigned int>* m_workBuffer2;
b3OpenCLArray<b3SortData>* m_workBuffer3;
b3OpenCLArray<b3SortData>* m_workBuffer4;
b3OpenCLArray<b3SortData>* m_workBuffer3;
b3OpenCLArray<b3SortData>* m_workBuffer4;
b3OpenCLArray<unsigned int>* m_workBuffer3a;
b3OpenCLArray<unsigned int>* m_workBuffer4a;
b3OpenCLArray<unsigned int>* m_workBuffer3a;
b3OpenCLArray<unsigned int>* m_workBuffer4a;
cl_command_queue m_commandQueue;
cl_command_queue m_commandQueue;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_streamCountSortDataKernel;
cl_kernel m_streamCountKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
cl_kernel m_prefixScanKernel;
cl_kernel m_sortAndScatterSortDataKernel;
cl_kernel m_sortAndScatterKernel;
bool m_deviceCPU;
bool m_deviceCPU;
class b3PrefixScanCL* m_scan;
class b3FillCL* m_fill;
class b3PrefixScanCL* m_scan;
class b3FillCL* m_fill;
public:
struct b3ConstData
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
{
int m_n;
int m_nWGs;
int m_startBit;
int m_nBlocksPerWG;
};
enum
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET=(1<<BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20*6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
{
DATA_ALIGNMENT = 256,
WG_SIZE = 64,
BLOCK_SIZE = 256,
ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
BITS_PER_PASS = 4,
NUM_BUCKET = (1 << BITS_PER_PASS),
// if you change this, change nPerWI in kernel as well
NUM_WGS = 20 * 6, // cypress
// NUM_WGS = 24*6, // cayman
// NUM_WGS = 32*4, // nv
};
private:
public:
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);
b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
virtual ~b3RadixSort32CL();
virtual ~b3RadixSort32CL();
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32 );
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32 );
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
///keys only
void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);
void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
};
#endif //B3_RADIXSORT32_H
#endif //B3_RADIXSORT32_H

View File

@@ -1,87 +1,86 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* boundSearchKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX+1;\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n"
;
static const char* boundSearchKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"typedef struct\n"
"{\n"
" u32 m_key; \n"
" u32 m_value;\n"
"}SortData;\n"
"typedef struct\n"
"{\n"
" u32 m_nSrc;\n"
" u32 m_nDst;\n"
" u32 m_padding[2];\n"
"} ConstBuffer;\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < nSrc )\n"
" {\n"
" SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
"// for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
" u32 k = jData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX+1;\n"
" if( gIdx < nSrc+1 )\n"
" {\n"
" SortData first; first.m_key = 0; first.m_value = 0;\n"
" SortData end; end.m_key = nDst; end.m_value = nDst;\n"
" SortData iData = src[gIdx-1];\n"
" SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
" if( iData.m_key != jData.m_key )\n"
" {\n"
" u32 k = iData.m_key;\n"
" {\n"
" dst[k] = gIdx;\n"
" }\n"
" }\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"__kernel\n"
"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
" unsigned int nSrc, unsigned int nDst)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" \n"
" if( gIdx < nDst )\n"
" {\n"
" C[gIdx] = A[gIdx] - B[gIdx];\n"
" }\n"
"}\n";

View File

@@ -1,132 +1,131 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for buffer-copy kernels (1/2/4-wide float4 copies
// plus float and float2 variants). Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* copyKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx <= cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx <= cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"\n"
;
// Stringified OpenCL C source for buffer-copy kernels (1/2/4-wide float4 copies
// plus float and float2 variants). Post-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* copyKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"typedef struct\n"
"{\n"
" int m_n;\n"
" int m_padding[3];\n"
"} ConstBuffer;\n"
"\n"
"\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx];\n"
"\n"
" dst[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 2*gIdx <= cb.m_n )\n"
" {\n"
" float4 a0 = src[gIdx*2+0];\n"
" float4 a1 = src[gIdx*2+1];\n"
"\n"
" dst[ gIdx*2+0 ] = a0;\n"
" dst[ gIdx*2+1 ] = a1;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( 4*gIdx <= cb.m_n )\n"
" {\n"
" int idx0 = gIdx*4+0;\n"
" int idx1 = gIdx*4+1;\n"
" int idx2 = gIdx*4+2;\n"
" int idx3 = gIdx*4+3;\n"
"\n"
" float4 a0 = src[idx0];\n"
" float4 a1 = src[idx1];\n"
" float4 a2 = src[idx2];\n"
" float4 a3 = src[idx3];\n"
"\n"
" dst[ idx0 ] = a0;\n"
" dst[ idx1 ] = a1;\n"
" dst[ idx2 ] = a2;\n"
" dst[ idx3 ] = a3;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float a0 = srcF1[gIdx];\n"
"\n"
" dstF1[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
" ConstBuffer cb)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
"\n"
" if( gIdx < cb.m_n )\n"
" {\n"
" float2 a0 = srcF2[gIdx];\n"
"\n"
" dstF2[ gIdx ] = a0;\n"
" }\n"
"}\n"
"\n"
"\n";

View File

@@ -1,91 +1,90 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for buffer-fill kernels (int, float, unsigned int,
// int2, int4 fills with an element offset). Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* fillKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"typedef struct\n"
"{\n"
" union\n"
" {\n"
" int4 m_data;\n"
" uint4 m_unsignedData;\n"
" float m_floatData;\n"
" };\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstFloat[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt4[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
;
// Stringified OpenCL C source for buffer-fill kernels (int, float, unsigned int,
// int2, int4 fills with an element offset). Post-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* fillKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define make_uint4 (uint4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"typedef struct\n"
"{\n"
" union\n"
" {\n"
" int4 m_data;\n"
" uint4 m_unsignedData;\n"
" float m_floatData;\n"
" };\n"
" int m_offset;\n"
" int m_n;\n"
" int m_padding[2];\n"
"} ConstBuffer;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillIntKernel(__global int* dstInt, int num_elements, int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillFloatKernel(__global float* dstFloat, int num_elements, float value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num_elements )\n"
" {\n"
" dstFloat[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt[ offset+gIdx ] = value;\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt2Kernel(__global int2* dstInt2, const int num, const int2 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
" }\n"
"}\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(64,1,1)))\n"
"void FillInt4Kernel(__global int4* dstInt4, const int num, const int4 value, const int offset)\n"
"{\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" if( gIdx < num )\n"
" {\n"
" dstInt4[ offset+gIdx ] = value;\n"
" }\n"
"}\n";

View File

@@ -1,129 +1,128 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for the u32 prefix-scan (exclusive scan) kernels:
// per-workgroup LocalScanKernel, block-sum TopLevelScanKernel, AddOffsetKernel.
// Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* prefixScanKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" u32 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
;
// Stringified OpenCL C source for the u32 prefix-scan (exclusive scan) kernels:
// per-workgroup LocalScanKernel, block-sum TopLevelScanKernel, AddOffsetKernel.
// Post-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* prefixScanKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" u32 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" u32 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
" uint4 cb)\n"
"{\n"
" __local u32 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" u32 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
"{\n"
" __local u32 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n";

View File

@@ -1,129 +1,128 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL C source for the float4 variant of the prefix-scan kernels
// (same Local/AddOffset/TopLevel structure as prefixScanKernelsCL, scanning
// float4 elements). Pre-clang-format side of the diff.
// NOTE(review): generated data — every "..." line is runtime kernel source;
// keep byte-for-byte and regenerate via the stringify step instead of editing.
static const char* prefixScanKernelsFloat4CL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
" uint m_numElems;\n"
" uint m_numBlocks;\n"
" uint m_numScanBlocks;\n"
" uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
" float4 blocksum;\n"
" int offset = 1;\n"
" for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" data[bi] += data[ai];\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" blocksum = data[ n-1 ];\n"
" data[ n-1 ] = 0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" offset >>= 1;\n"
" for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
" {\n"
" GROUP_LDS_BARRIER;\n"
" for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
" {\n"
" int ai = offset*(2*iIdx+1)-1;\n"
" int bi = offset*(2*iIdx+2)-1;\n"
" float4 temp = data[ai];\n"
" data[ai] = data[bi];\n"
" data[bi] += temp;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
" __local float4 ldsData[WG_SIZE*2];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
" ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" if( lIdx == 0 ) \n"
" sumBuffer[GET_GROUP_IDX] = sum;\n"
" if( (2*gIdx) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx] = ldsData[2*lIdx];\n"
" }\n"
" if( (2*gIdx + 1) < cb.m_numElems )\n"
" {\n"
" dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
" const u32 blockSize = WG_SIZE*2;\n"
" int myIdx = GET_GROUP_IDX+1;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" float4 iBlockSum = blockSum[myIdx];\n"
" int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
" for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
" {\n"
" dst[i] += iBlockSum;\n"
" }\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
" __local float4 ldsData[2048];\n"
" int gIdx = GET_GLOBAL_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int lSize = GET_GROUP_SIZE;\n"
" for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
" {\n"
" ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
" for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
" {\n"
" dst[i] = ldsData[i];\n"
" }\n"
" if( gIdx == 0 )\n"
" {\n"
" dst[cb.m_numBlocks] = sum;\n"
" }\n"
"}\n"
;
// Stringified OpenCL program implementing a work-efficient exclusive prefix scan
// (Blelloch up-sweep/down-sweep) over float4 elements. The string is compiled at
// runtime (clCreateProgramWithSource/clBuildProgram); the three kernels are the
// usual scan phases: LocalScanKernel (per-workgroup scan + block sums),
// TopLevelScanKernel (scan of the block sums), AddOffsetKernel (add block offset
// back to each element). cb packs {m_numElems, m_numBlocks, m_numScanBlocks} into
// a uint4 via the #defines inside the string.
// NOTE(review): presumably autogenerated by stringify (premake --stringify) like
// the other kernel headers in this project — do not hand-edit the literal;
// regenerate from the .cl source instead.
static const char* prefixScanKernelsFloat4CL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"typedef unsigned int u32;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"// takahiro end\n"
"#define WG_SIZE 128 \n"
"#define m_numElems x\n"
"#define m_numBlocks y\n"
"#define m_numScanBlocks z\n"
"/*typedef struct\n"
"{\n"
"	uint m_numElems;\n"
"	uint m_numBlocks;\n"
"	uint m_numScanBlocks;\n"
"	uint m_padding[1];\n"
"} ConstBuffer;\n"
"*/\n"
"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
"{\n"
"	float4 blocksum;\n"
"	int offset = 1;\n"
"	for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			data[bi] += data[ai];\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	if( lIdx == 0 )\n"
"	{\n"
"		blocksum = data[ n-1 ];\n"
"		data[ n-1 ] = 0;\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	offset >>= 1;\n"
"	for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
"	{\n"
"		GROUP_LDS_BARRIER;\n"
"		for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
"		{\n"
"			int ai = offset*(2*iIdx+1)-1;\n"
"			int bi = offset*(2*iIdx+2)-1;\n"
"			float4 temp = data[ai];\n"
"			data[ai] = data[bi];\n"
"			data[bi] += temp;\n"
"		}\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	return blocksum;\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer, uint4 cb)\n"
"{\n"
"	__local float4 ldsData[WG_SIZE*2];\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	ldsData[2*lIdx] = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"	if( lIdx == 0 ) \n"
"		sumBuffer[GET_GROUP_IDX] = sum;\n"
"	if( (2*gIdx) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx] = ldsData[2*lIdx];\n"
"	}\n"
"	if( (2*gIdx + 1) < cb.m_numElems )\n"
"	{\n"
"		dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
"{\n"
"	const u32 blockSize = WG_SIZE*2;\n"
"	int myIdx = GET_GROUP_IDX+1;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	float4 iBlockSum = blockSum[myIdx];\n"
"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
"	{\n"
"		dst[i] += iBlockSum;\n"
"	}\n"
"}\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"__kernel\n"
"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
"{\n"
"	__local float4 ldsData[2048];\n"
"	int gIdx = GET_GLOBAL_IDX;\n"
"	int lIdx = GET_LOCAL_IDX;\n"
"	int lSize = GET_GROUP_SIZE;\n"
"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
"	{\n"
"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
"	}\n"
"	GROUP_LDS_BARRIER;\n"
"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
"	{\n"
"		dst[i] = ldsData[i];\n"
"	}\n"
"	if( gIdx == 0 )\n"
"	{\n"
"		dst[cb.m_numBlocks] = sum;\n"
"	}\n"
"}\n";

View File

@@ -4,7 +4,6 @@
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
@@ -15,38 +14,35 @@
#include "Bullet3OpenCL/Raycast/kernels/rayCastKernels.h"
#define B3_RAYCAST_PATH "src/Bullet3OpenCL/Raycast/kernels/rayCastKernels.cl"
struct b3GpuRaycastInternalData
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_q;
cl_command_queue m_q;
cl_kernel m_raytraceKernel;
cl_kernel m_raytracePairsKernel;
cl_kernel m_findRayRigidPairIndexRanges;
b3GpuParallelLinearBvh* m_plbvh;
b3RadixSort32CL* m_radixSorter;
b3FillCL* m_fill;
//1 element per ray
b3OpenCLArray<b3RayInfo>* m_gpuRays;
b3OpenCLArray<b3RayHit>* m_gpuHitResults;
b3OpenCLArray<int>* m_firstRayRigidPairIndexPerRay;
b3OpenCLArray<int>* m_numRayRigidPairsPerRay;
//1 element per (ray index, rigid index) pair, where the ray intersects with the rigid's AABB
b3OpenCLArray<int>* m_gpuNumRayRigidPairs;
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
b3OpenCLArray<b3Int2>* m_gpuRayRigidPairs; //x == ray index, y == rigid index
int m_test;
};
b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q)
b3GpuRaycast::b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q)
{
m_data = new b3GpuRaycastInternalData;
m_data->m_context = ctx;
@@ -59,7 +55,7 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
m_data->m_plbvh = new b3GpuParallelLinearBvh(ctx, device, q);
m_data->m_radixSorter = new b3RadixSort32CL(ctx, device, q);
m_data->m_fill = new b3FillCL(ctx, device, q);
m_data->m_gpuRays = new b3OpenCLArray<b3RayInfo>(ctx, q);
m_data->m_gpuHitResults = new b3OpenCLArray<b3RayHit>(ctx, q);
m_data->m_firstRayRigidPairIndexPerRay = new b3OpenCLArray<int>(ctx, q);
@@ -68,19 +64,17 @@ b3GpuRaycast::b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue
m_data->m_gpuRayRigidPairs = new b3OpenCLArray<b3Int2>(ctx, q);
{
cl_int errNum=0;
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,rayCastKernelCL,&errNum,"",B3_RAYCAST_PATH);
b3Assert(errNum==CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "rayCastPairsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,rayCastKernelCL, "findRayRigidPairIndexRanges",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
cl_int errNum = 0;
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, &errNum, "", B3_RAYCAST_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytraceKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_raytracePairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "rayCastPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_findRayRigidPairIndexRanges = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, rayCastKernelCL, "findRayRigidPairIndexRanges", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
}
b3GpuRaycast::~b3GpuRaycast()
@@ -88,78 +82,80 @@ b3GpuRaycast::~b3GpuRaycast()
clReleaseKernel(m_data->m_raytraceKernel);
clReleaseKernel(m_data->m_raytracePairsKernel);
clReleaseKernel(m_data->m_findRayRigidPairIndexRanges);
delete m_data->m_plbvh;
delete m_data->m_radixSorter;
delete m_data->m_fill;
delete m_data->m_gpuRays;
delete m_data->m_gpuHitResults;
delete m_data->m_firstRayRigidPairIndexPerRay;
delete m_data->m_numRayRigidPairsPerRay;
delete m_data->m_gpuNumRayRigidPairs;
delete m_data->m_gpuRayRigidPairs;
delete m_data;
}
bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction)
bool sphere_intersect(const b3Vector3& spherePos, b3Scalar radius, const b3Vector3& rayFrom, const b3Vector3& rayTo, float& hitFraction)
{
b3Vector3 rs = rayFrom - spherePos;
b3Vector3 rayDir = rayTo-rayFrom;
float A = b3Dot(rayDir,rayDir);
float B = b3Dot(rs, rayDir);
float C = b3Dot(rs, rs) - (radius * radius);
float D = B * B - A*C;
b3Vector3 rs = rayFrom - spherePos;
b3Vector3 rayDir = rayTo - rayFrom;
if (D > 0.0)
{
float t = (-B - sqrt(D))/A;
float A = b3Dot(rayDir, rayDir);
float B = b3Dot(rs, rayDir);
float C = b3Dot(rs, rs) - (radius * radius);
if ( (t >= 0.0f) && (t < hitFraction) )
{
float D = B * B - A * C;
if (D > 0.0)
{
float t = (-B - sqrt(D)) / A;
if ((t >= 0.0f) && (t < hitFraction))
{
hitFraction = t;
return true;
return true;
}
}
return false;
}
bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const b3ConvexPolyhedronData& poly,
const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal)
const b3AlignedObjectArray<b3GpuFace>& faces, float& hitFraction, b3Vector3& hitNormal)
{
float exitFraction = hitFraction;
float enterFraction = -0.1f;
b3Vector3 curHitNormal=b3MakeVector3(0,0,0);
for (int i=0;i<poly.m_numFaces;i++)
b3Vector3 curHitNormal = b3MakeVector3(0, 0, 0);
for (int i = 0; i < poly.m_numFaces; i++)
{
const b3GpuFace& face = faces[poly.m_faceOffset+i];
float fromPlaneDist = b3Dot(rayFromLocal,face.m_plane)+face.m_plane.w;
float toPlaneDist = b3Dot(rayToLocal,face.m_plane)+face.m_plane.w;
if (fromPlaneDist<0.f)
const b3GpuFace& face = faces[poly.m_faceOffset + i];
float fromPlaneDist = b3Dot(rayFromLocal, face.m_plane) + face.m_plane.w;
float toPlaneDist = b3Dot(rayToLocal, face.m_plane) + face.m_plane.w;
if (fromPlaneDist < 0.f)
{
if (toPlaneDist >= 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
if (exitFraction>fraction)
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (exitFraction > fraction)
{
exitFraction = fraction;
}
}
} else
}
}
else
{
if (toPlaneDist<0.f)
if (toPlaneDist < 0.f)
{
float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);
float fraction = fromPlaneDist / (fromPlaneDist - toPlaneDist);
if (enterFraction <= fraction)
{
enterFraction = fraction;
curHitNormal = face.m_plane;
curHitNormal.w = 0.f;
}
} else
}
else
{
return false;
}
@@ -176,44 +172,41 @@ bool rayConvex(const b3Vector3& rayFromLocal, const b3Vector3& rayToLocal, const
return true;
}
void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables,const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables, const struct b3GpuNarrowPhaseInternalData* narrowphaseData)
{
// return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables);
// return castRays(rays,hitResults,numBodies,bodies,numCollidables,collidables);
B3_PROFILE("castRaysHost");
for (int r=0;r<rays.size();r++)
for (int r = 0; r < rays.size(); r++)
{
b3Vector3 rayFrom = rays[r].m_from;
b3Vector3 rayTo = rays[r].m_to;
float hitFraction = hitResults[r].m_hitFraction;
int hitBodyIndex= -1;
int hitBodyIndex = -1;
b3Vector3 hitNormal;
for (int b=0;b<numBodies;b++)
for (int b = 0; b < numBodies; b++)
{
const b3Vector3& pos = bodies[b].m_pos;
//const b3Quaternion& orn = bodies[b].m_quat;
switch (collidables[bodies[b].m_collidableIdx].m_shapeType)
{
case SHAPE_SPHERE:
case SHAPE_SPHERE:
{
b3Scalar radius = collidables[bodies[b].m_collidableIdx].m_radius;
if (sphere_intersect(pos, radius, rayFrom, rayTo,hitFraction))
if (sphere_intersect(pos, radius, rayFrom, rayTo, hitFraction))
{
hitBodyIndex = b;
b3Vector3 hitPoint;
hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction);
hitNormal = (hitPoint-bodies[b].m_pos).normalize();
hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
hitNormal = (hitPoint - bodies[b].m_pos).normalize();
}
}
case SHAPE_CONVEX_HULL:
case SHAPE_CONVEX_HULL:
{
b3Transform convexWorldTransform;
convexWorldTransform.setIdentity();
convexWorldTransform.setOrigin(bodies[b].m_pos);
@@ -222,72 +215,67 @@ void b3GpuRaycast::castRaysHost(const b3AlignedObjectArray<b3RayInfo>& rays, b3A
b3Vector3 rayFromLocal = convexWorld2Local(rayFrom);
b3Vector3 rayToLocal = convexWorld2Local(rayTo);
int shapeIndex = collidables[bodies[b].m_collidableIdx].m_shapeIndex;
const b3ConvexPolyhedronData& poly = narrowphaseData->m_convexPolyhedra[shapeIndex];
if (rayConvex(rayFromLocal, rayToLocal,poly,narrowphaseData->m_convexFaces, hitFraction, hitNormal))
if (rayConvex(rayFromLocal, rayToLocal, poly, narrowphaseData->m_convexFaces, hitFraction, hitNormal))
{
hitBodyIndex = b;
}
break;
}
default:
default:
{
static bool once=true;
static bool once = true;
if (once)
{
once=false;
once = false;
b3Warning("Raytest: unsupported shape type\n");
}
}
}
}
if (hitBodyIndex>=0)
if (hitBodyIndex >= 0)
{
hitResults[r].m_hitFraction = hitFraction;
hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to,hitFraction);
hitResults[r].m_hitPoint.setInterpolate3(rays[r].m_from, rays[r].m_to, hitFraction);
hitResults[r].m_hitNormal = hitNormal;
hitResults[r].m_hitBody = hitBodyIndex;
}
}
}
///todo: add some acceleration structure (AABBs, tree etc)
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase)
{
//castRaysHost(rays,hitResults,numBodies,bodies,numCollidables,collidables,narrowphaseData);
B3_PROFILE("castRaysGPU");
{
B3_PROFILE("raycast copyFromHost");
m_data->m_gpuRays->copyFromHost(rays);
m_data->m_gpuHitResults->copyFromHost(hitResults);
}
int numRays = hitResults.size();
{
m_data->m_firstRayRigidPairIndexPerRay->resize(numRays);
m_data->m_numRayRigidPairsPerRay->resize(numRays);
m_data->m_gpuNumRayRigidPairs->resize(1);
m_data->m_gpuRayRigidPairs->resize(numRays * 16);
}
//run kernel
const bool USE_BRUTE_FORCE_RAYCAST = false;
if(USE_BRUTE_FORCE_RAYCAST)
if (USE_BRUTE_FORCE_RAYCAST)
{
B3_PROFILE("raycast launch1D");
b3LauncherCL launcher(m_data->m_q,m_data->m_raytraceKernel,"m_raytraceKernel");
b3LauncherCL launcher(m_data->m_q, m_data->m_raytraceKernel, "m_raytraceKernel");
int numRays = rays.size();
launcher.setConst(numRays);
@@ -299,93 +287,88 @@ void b3GpuRaycast::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3Align
launcher.setBuffer(narrowphaseData->m_collidablesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexFacesGPU->getBufferCL());
launcher.setBuffer(narrowphaseData->m_convexPolyhedraGPU->getBufferCL());
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
else
{
m_data->m_plbvh->build( broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU() );
m_data->m_plbvh->build(broadphase->getAllAabbsGPU(), broadphase->getSmallAabbIndicesGPU(), broadphase->getLargeAabbIndicesGPU());
m_data->m_plbvh->testRaysAgainstBvhAabbs(*m_data->m_gpuRays, *m_data->m_gpuNumRayRigidPairs, *m_data->m_gpuRayRigidPairs);
int numRayRigidPairs = -1;
m_data->m_gpuNumRayRigidPairs->copyToHostPointer(&numRayRigidPairs, 1);
if( numRayRigidPairs > m_data->m_gpuRayRigidPairs->size() )
if (numRayRigidPairs > m_data->m_gpuRayRigidPairs->size())
{
numRayRigidPairs = m_data->m_gpuRayRigidPairs->size();
m_data->m_gpuNumRayRigidPairs->copyFromHostPointer(&numRayRigidPairs, 1);
}
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
m_data->m_gpuRayRigidPairs->resize(numRayRigidPairs); //Radix sort needs b3OpenCLArray::size() to be correct
//Sort ray-rigid pairs by ray index
{
B3_PROFILE("sort ray-rigid pairs");
m_data->m_radixSorter->execute( *reinterpret_cast< b3OpenCLArray<b3SortData>* >(m_data->m_gpuRayRigidPairs) );
m_data->m_radixSorter->execute(*reinterpret_cast<b3OpenCLArray<b3SortData>*>(m_data->m_gpuRayRigidPairs));
}
//detect start,count of each ray pair
{
B3_PROFILE("detect ray-rigid pair index ranges");
{
B3_PROFILE("reset ray-rigid pair index ranges");
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_firstRayRigidPairIndexPerRay, numRayRigidPairs, numRays); //atomic_min used to find first index
m_data->m_fill->execute(*m_data->m_numRayRigidPairsPerRay, 0, numRays);
clFinish(m_data->m_q);
}
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() ),
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_findRayRigidPairIndexRanges, "m_findRayRigidPairIndexRanges");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRayRigidPairs);
launcher.launch1D(numRayRigidPairs);
clFinish(m_data->m_q);
}
{
B3_PROFILE("ray-rigid intersection");
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL( m_data->m_gpuRays->getBufferCL() ),
b3BufferInfoCL( m_data->m_gpuHitResults->getBufferCL() ),
b3BufferInfoCL( m_data->m_firstRayRigidPairIndexPerRay->getBufferCL() ),
b3BufferInfoCL( m_data->m_numRayRigidPairsPerRay->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_bodyBufferGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_collidablesGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_convexFacesGPU->getBufferCL() ),
b3BufferInfoCL( narrowphaseData->m_convexPolyhedraGPU->getBufferCL() ),
b3BufferInfoCL( m_data->m_gpuRayRigidPairs->getBufferCL() )
};
b3BufferInfoCL bufferInfo[] =
{
b3BufferInfoCL(m_data->m_gpuRays->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuHitResults->getBufferCL()),
b3BufferInfoCL(m_data->m_firstRayRigidPairIndexPerRay->getBufferCL()),
b3BufferInfoCL(m_data->m_numRayRigidPairsPerRay->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_bodyBufferGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_collidablesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexFacesGPU->getBufferCL()),
b3BufferInfoCL(narrowphaseData->m_convexPolyhedraGPU->getBufferCL()),
b3BufferInfoCL(m_data->m_gpuRayRigidPairs->getBufferCL())};
b3LauncherCL launcher(m_data->m_q, m_data->m_raytracePairsKernel, "m_raytracePairsKernel");
launcher.setBuffers( bufferInfo, sizeof(bufferInfo)/sizeof(b3BufferInfoCL) );
launcher.setBuffers(bufferInfo, sizeof(bufferInfo) / sizeof(b3BufferInfoCL));
launcher.setConst(numRays);
launcher.launch1D(numRays);
clFinish(m_data->m_q);
}
}
//copy results
{
B3_PROFILE("raycast copyToHost");
m_data->m_gpuHitResults->copyToHost(hitResults);
}
}

View File

@@ -7,26 +7,22 @@
#include "Bullet3Common/b3AlignedObjectArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3RaycastInfo.h"
class b3GpuRaycast
{
protected:
struct b3GpuRaycastInternalData* m_data;
public:
b3GpuRaycast(cl_context ctx,cl_device_id device, cl_command_queue q);
b3GpuRaycast(cl_context ctx, cl_device_id device, cl_command_queue q);
virtual ~b3GpuRaycast();
void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
void castRaysHost(const b3AlignedObjectArray<b3RayInfo>& raysIn, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies,const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults,
int numBodies, const struct b3RigidBodyData* bodies, int numCollidables, const struct b3Collidable* collidables,
const struct b3GpuNarrowPhaseInternalData* narrowphaseData, class b3GpuBroadphaseInterface* broadphase);
};
#endif //B3_GPU_RAYCAST_H
#endif //B3_GPU_RAYCAST_H

View File

@@ -1,381 +1,380 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL ray-cast program, compiled at runtime by b3GpuRaycast
// (see compileCLProgramFromString usage elsewhere in this commit). Contains:
//  - rayCastKernel: brute-force ray vs. every body (sphere + convex hull).
//  - findRayRigidPairIndexRanges: per-ray start index (atomic_min) and pair
//    count (atomic_inc) into the sorted ray-rigid pair array.
//  - rayCastPairsKernel: ray test restricted to broadphase-produced pairs.
// Do not hand-edit the literal; regenerate from rayCastKernels.cl instead.
// NOTE(review): this diff view contains a second definition of
// rayCastKernelCL further down (post-clang-format copy) — in the real file
// only one definition exists; verify against the repository.
static const char* rayCastKernelCL= \
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_PLANE 4\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef struct\n"
"{\n"
"	float4 m_from;\n"
"	float4 m_to;\n"
"} b3RayInfo;\n"
"typedef struct\n"
"{\n"
"	float m_hitFraction;\n"
"	int m_hitResult0;\n"
"	int m_hitResult1;\n"
"	int m_hitResult2;\n"
"	float4 m_hitPoint;\n"
"	float4 m_hitNormal;\n"
"} b3RayHit;\n"
"typedef struct\n"
"{\n"
"	float4 m_pos;\n"
"	float4 m_quat;\n"
"	float4 m_linVel;\n"
"	float4 m_angVel;\n"
"	unsigned int m_collidableIdx;\n"
"	float m_invMass;\n"
"	float m_restituitionCoeff;\n"
"	float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct Collidable\n"
"{\n"
"	union {\n"
"		int m_numChildShapes;\n"
"		int m_bvhIndex;\n"
"	};\n"
"	float m_radius;\n"
"	int m_shapeType;\n"
"	int m_shapeIndex;\n"
"} Collidable;\n"
"typedef struct \n"
"{\n"
"	float4 m_localCenter;\n"
"	float4 m_extents;\n"
"	float4 mC;\n"
"	float4 mE;\n"
"	float m_radius;\n"
"	int m_faceOffset;\n"
"	int m_numFaces;\n"
"	int m_numVertices;\n"
"	int m_vertexOffset;\n"
"	int m_uniqueEdgesOffset;\n"
"	int m_numUniqueEdges;\n"
"	int m_unused;\n"
"} ConvexPolyhedronCL;\n"
"typedef struct\n"
"{\n"
"	float4 m_plane;\n"
"	int m_indexOffset;\n"
"	int m_numIndices;\n"
"} b3GpuFace;\n"
"///////////////////////////////////////\n"
"//	Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"__inline\n"
"	Quaternion qtMul(Quaternion a, Quaternion b);\n"
"__inline\n"
"	Quaternion qtNormalize(Quaternion in);\n"
"__inline\n"
"	Quaternion qtInvert(Quaternion q);\n"
"__inline\n"
"	float dot3F4(float4 a, float4 b)\n"
"{\n"
"	float4 a1 = (float4)(a.xyz,0.f);\n"
"	float4 b1 = (float4)(b.xyz,0.f);\n"
"	return dot(a1, b1);\n"
"}\n"
"__inline\n"
"	Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
"	Quaternion ans;\n"
"	ans = cross( a, b );\n"
"	ans += a.w*b+b.w*a;\n"
"	//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
"	ans.w = a.w*b.w - dot3F4(a, b);\n"
"	return ans;\n"
"}\n"
"__inline\n"
"	Quaternion qtNormalize(Quaternion in)\n"
"{\n"
"	return fast_normalize(in);\n"
"	//	in /= length( in );\n"
"	//	return in;\n"
"}\n"
"__inline\n"
"	float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
"	Quaternion qInv = qtInvert( q );\n"
"	float4 vcpy = vec;\n"
"	vcpy.w = 0.f;\n"
"	float4 out = qtMul(q,vcpy);\n"
"	out = qtMul(out,qInv);\n"
"	return out;\n"
"}\n"
"__inline\n"
"	Quaternion qtInvert(Quaternion q)\n"
"{\n"
"	return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"__inline\n"
"	float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
"	return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"void trInverse(float4 translationIn, Quaternion orientationIn,\n"
"	float4* translationOut, Quaternion* orientationOut)\n"
"{\n"
"	*orientationOut = qtInvert(orientationIn);\n"
"	*translationOut = qtRotate(*orientationOut, -translationIn);\n"
"}\n"
"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
"	__global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
"{\n"
"	rayFromLocal.w = 0.f;\n"
"	rayToLocal.w = 0.f;\n"
"	bool result = true;\n"
"	float exitFraction = hitFraction[0];\n"
"	float enterFraction = -0.3f;\n"
"	float4 curHitNormal = (float4)(0,0,0,0);\n"
"	for (int i=0;i<numFaces && result;i++)\n"
"	{\n"
"		b3GpuFace face = faces[faceOffset+i];\n"
"		float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
"		float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
"		if (fromPlaneDist<0.f)\n"
"		{\n"
"			if (toPlaneDist >= 0.f)\n"
"			{\n"
"				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
"				if (exitFraction>fraction)\n"
"				{\n"
"					exitFraction = fraction;\n"
"				}\n"
"			} \n"
"		} else\n"
"		{\n"
"			if (toPlaneDist<0.f)\n"
"			{\n"
"				float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
"				if (enterFraction <= fraction)\n"
"				{\n"
"					enterFraction = fraction;\n"
"					curHitNormal = face.m_plane;\n"
"					curHitNormal.w = 0.f;\n"
"				}\n"
"			} else\n"
"			{\n"
"				result = false;\n"
"			}\n"
"		}\n"
"		if (exitFraction <= enterFraction)\n"
"			result = false;\n"
"	}\n"
"	if (enterFraction < 0.f)\n"
"	{\n"
"		result = false;\n"
"	}\n"
"	if (result)\n"
"	{	\n"
"		hitFraction[0] = enterFraction;\n"
"		hitNormal[0] = curHitNormal;\n"
"	}\n"
"	return result;\n"
"}\n"
"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
"{\n"
"	float4 rs = rayFrom - spherePos;\n"
"	rs.w = 0.f;\n"
"	float4 rayDir = rayTo-rayFrom;\n"
"	rayDir.w = 0.f;\n"
"	float A = dot(rayDir,rayDir);\n"
"	float B = dot(rs, rayDir);\n"
"	float C = dot(rs, rs) - (radius * radius);\n"
"	float D = B * B - A*C;\n"
"	if (D > 0.0f)\n"
"	{\n"
"		float t = (-B - sqrt(D))/A;\n"
"		if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
"		{\n"
"			*hitFraction = t;\n"
"			return true;\n"
"		}\n"
"	}\n"
"	return false;\n"
"}\n"
"float4 setInterpolate3(float4 from, float4 to, float t)\n"
"{\n"
"	float s = 1.0f - t;\n"
"	float4 result;\n"
"	result = s * from + t * to;\n"
"	result.w = 0.f;	\n"
"	return result;	\n"
"}\n"
"__kernel void rayCastKernel( \n"
"	int numRays, \n"
"	const __global b3RayInfo* rays, \n"
"	__global b3RayHit* hitResults, \n"
"	const int numBodies, \n"
"	__global Body* bodies,\n"
"	__global Collidable* collidables,\n"
"	__global const b3GpuFace* faces,\n"
"	__global const ConvexPolyhedronCL* convexShapes )\n"
"{\n"
"	int i = get_global_id(0);\n"
"	if (i>=numRays)\n"
"		return;\n"
"	hitResults[i].m_hitFraction = 1.f;\n"
"	float4 rayFrom = rays[i].m_from;\n"
"	float4 rayTo = rays[i].m_to;\n"
"	float hitFraction = 1.f;\n"
"	float4 hitPoint;\n"
"	float4 hitNormal;\n"
"	int hitBodyIndex= -1;\n"
"	int cachedCollidableIndex = -1;\n"
"	Collidable cachedCollidable;\n"
"	for (int b=0;b<numBodies;b++)\n"
"	{\n"
"		if (hitResults[i].m_hitResult2==b)\n"
"			continue;\n"
"		Body body = bodies[b];\n"
"		float4 pos = body.m_pos;\n"
"		float4 orn = body.m_quat;\n"
"		if (cachedCollidableIndex != body.m_collidableIdx)\n"
"		{\n"
"			cachedCollidableIndex = body.m_collidableIdx;\n"
"			cachedCollidable = collidables[cachedCollidableIndex];\n"
"		}\n"
"		if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
"		{\n"
"			float4 invPos = (float4)(0,0,0,0);\n"
"			float4 invOrn = (float4)(0,0,0,0);\n"
"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
"			float4 rayToLocal = (float4)(0,0,0,0);\n"
"			invOrn = qtInvert(orn);\n"
"			invPos = qtRotate(invOrn, -pos);\n"
"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
"			rayFromLocal.w = 0.f;\n"
"			rayToLocal.w = 0.f;\n"
"			int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
"			int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
"			if (numFaces)\n"
"			{\n"
"				if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
"				{\n"
"					hitBodyIndex = b;\n"
"					\n"
"				}\n"
"			}\n"
"		}\n"
"		if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
"		{\n"
"			float radius = cachedCollidable.m_radius;\n"
"		\n"
"			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
"			{\n"
"				hitBodyIndex = b;\n"
"				hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
"			}\n"
"		}\n"
"	}\n"
"	if (hitBodyIndex>=0)\n"
"	{\n"
"		hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
"		hitResults[i].m_hitFraction = hitFraction;\n"
"		hitResults[i].m_hitPoint = hitPoint;\n"
"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
"	}\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
"	__global int* out_firstRayRigidPairIndexPerRay,\n"
"	__global int* out_numRayRigidPairsPerRay,\n"
"	int numRayRigidPairs)\n"
"{\n"
"	int rayRigidPairIndex = get_global_id(0);\n"
"	if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
"	\n"
"	int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
"	\n"
"	atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
"	atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
"	__global b3RayHit* hitResults, \n"
"	__global int* firstRayRigidPairIndexPerRay,\n"
"	__global int* numRayRigidPairsPerRay,\n"
"	\n"
"	__global Body* bodies,\n"
"	__global Collidable* collidables,\n"
"	__global const b3GpuFace* faces,\n"
"	__global const ConvexPolyhedronCL* convexShapes,\n"
"	\n"
"	__global int2* rayRigidPairs,\n"
"	int numRays)\n"
"{\n"
"	int i = get_global_id(0);\n"
"	if (i >= numRays) return;\n"
"	\n"
"	float4 rayFrom = rays[i].m_from;\n"
"	float4 rayTo = rays[i].m_to;\n"
"	\n"
"	hitResults[i].m_hitFraction = 1.f;\n"
"	\n"
"	float hitFraction = 1.f;\n"
"	float4 hitPoint;\n"
"	float4 hitNormal;\n"
"	int hitBodyIndex = -1;\n"
"	\n"
"	//\n"
"	for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
"	{\n"
"		int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
"		int b = rayRigidPairs[rayRigidPairIndex].y;\n"
"		\n"
"		if (hitResults[i].m_hitResult2 == b) continue;\n"
"		\n"
"		Body body = bodies[b];\n"
"		Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
"		\n"
"		float4 pos = body.m_pos;\n"
"		float4 orn = body.m_quat;\n"
"		\n"
"		if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
"		{\n"
"			float4 invPos = (float4)(0,0,0,0);\n"
"			float4 invOrn = (float4)(0,0,0,0);\n"
"			float4 rayFromLocal = (float4)(0,0,0,0);\n"
"			float4 rayToLocal = (float4)(0,0,0,0);\n"
"			invOrn = qtInvert(orn);\n"
"			invPos = qtRotate(invOrn, -pos);\n"
"			rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
"			rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
"			rayFromLocal.w = 0.f;\n"
"			rayToLocal.w = 0.f;\n"
"			int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
"			int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
"			\n"
"			if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
"			{\n"
"				hitBodyIndex = b;\n"
"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
"			}\n"
"		}\n"
"		\n"
"		if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
"		{\n"
"			float radius = rigidCollidable.m_radius;\n"
"		\n"
"			if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
"			{\n"
"				hitBodyIndex = b;\n"
"				hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
"				hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
"			}\n"
"		}\n"
"	}\n"
"	\n"
"	if (hitBodyIndex >= 0)\n"
"	{\n"
"		hitResults[i].m_hitFraction = hitFraction;\n"
"		hitResults[i].m_hitPoint = hitPoint;\n"
"		hitResults[i].m_hitNormal = normalize(hitNormal);\n"
"		hitResults[i].m_hitResult0 = hitBodyIndex;\n"
"	}\n"
"	\n"
"}\n"
;
static const char* rayCastKernelCL =
"#define SHAPE_CONVEX_HULL 3\n"
"#define SHAPE_PLANE 4\n"
"#define SHAPE_CONCAVE_TRIMESH 5\n"
"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
"#define SHAPE_SPHERE 7\n"
"typedef struct\n"
"{\n"
" float4 m_from;\n"
" float4 m_to;\n"
"} b3RayInfo;\n"
"typedef struct\n"
"{\n"
" float m_hitFraction;\n"
" int m_hitResult0;\n"
" int m_hitResult1;\n"
" int m_hitResult2;\n"
" float4 m_hitPoint;\n"
" float4 m_hitNormal;\n"
"} b3RayHit;\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" float4 m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" unsigned int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct Collidable\n"
"{\n"
" union {\n"
" int m_numChildShapes;\n"
" int m_bvhIndex;\n"
" };\n"
" float m_radius;\n"
" int m_shapeType;\n"
" int m_shapeIndex;\n"
"} Collidable;\n"
"typedef struct \n"
"{\n"
" float4 m_localCenter;\n"
" float4 m_extents;\n"
" float4 mC;\n"
" float4 mE;\n"
" float m_radius;\n"
" int m_faceOffset;\n"
" int m_numFaces;\n"
" int m_numVertices;\n"
" int m_vertexOffset;\n"
" int m_uniqueEdgesOffset;\n"
" int m_numUniqueEdges;\n"
" int m_unused;\n"
"} ConvexPolyhedronCL;\n"
"typedef struct\n"
"{\n"
" float4 m_plane;\n"
" int m_indexOffset;\n"
" int m_numIndices;\n"
"} b3GpuFace;\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b);\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in);\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q);\n"
"__inline\n"
" float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = (float4)(a.xyz,0.f);\n"
" float4 b1 = (float4)(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
" Quaternion qtMul(Quaternion a, Quaternion b)\n"
"{\n"
" Quaternion ans;\n"
" ans = cross( a, b );\n"
" ans += a.w*b+b.w*a;\n"
" // ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"__inline\n"
" Quaternion qtNormalize(Quaternion in)\n"
"{\n"
" return fast_normalize(in);\n"
" // in /= length( in );\n"
" // return in;\n"
"}\n"
"__inline\n"
" float4 qtRotate(Quaternion q, float4 vec)\n"
"{\n"
" Quaternion qInv = qtInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = qtMul(q,vcpy);\n"
" out = qtMul(out,qInv);\n"
" return out;\n"
"}\n"
"__inline\n"
" Quaternion qtInvert(Quaternion q)\n"
"{\n"
" return (Quaternion)(-q.xyz, q.w);\n"
"}\n"
"__inline\n"
" float4 qtInvRotate(const Quaternion q, float4 vec)\n"
"{\n"
" return qtRotate( qtInvert( q ), vec );\n"
"}\n"
"void trInverse(float4 translationIn, Quaternion orientationIn,\n"
" float4* translationOut, Quaternion* orientationOut)\n"
"{\n"
" *orientationOut = qtInvert(orientationIn);\n"
" *translationOut = qtRotate(*orientationOut, -translationIn);\n"
"}\n"
"bool rayConvex(float4 rayFromLocal, float4 rayToLocal, int numFaces, int faceOffset,\n"
" __global const b3GpuFace* faces, float* hitFraction, float4* hitNormal)\n"
"{\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" bool result = true;\n"
" float exitFraction = hitFraction[0];\n"
" float enterFraction = -0.3f;\n"
" float4 curHitNormal = (float4)(0,0,0,0);\n"
" for (int i=0;i<numFaces && result;i++)\n"
" {\n"
" b3GpuFace face = faces[faceOffset+i];\n"
" float fromPlaneDist = dot(rayFromLocal,face.m_plane)+face.m_plane.w;\n"
" float toPlaneDist = dot(rayToLocal,face.m_plane)+face.m_plane.w;\n"
" if (fromPlaneDist<0.f)\n"
" {\n"
" if (toPlaneDist >= 0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (exitFraction>fraction)\n"
" {\n"
" exitFraction = fraction;\n"
" }\n"
" } \n"
" } else\n"
" {\n"
" if (toPlaneDist<0.f)\n"
" {\n"
" float fraction = fromPlaneDist / (fromPlaneDist-toPlaneDist);\n"
" if (enterFraction <= fraction)\n"
" {\n"
" enterFraction = fraction;\n"
" curHitNormal = face.m_plane;\n"
" curHitNormal.w = 0.f;\n"
" }\n"
" } else\n"
" {\n"
" result = false;\n"
" }\n"
" }\n"
" if (exitFraction <= enterFraction)\n"
" result = false;\n"
" }\n"
" if (enterFraction < 0.f)\n"
" {\n"
" result = false;\n"
" }\n"
" if (result)\n"
" { \n"
" hitFraction[0] = enterFraction;\n"
" hitNormal[0] = curHitNormal;\n"
" }\n"
" return result;\n"
"}\n"
"bool sphere_intersect(float4 spherePos, float radius, float4 rayFrom, float4 rayTo, float* hitFraction)\n"
"{\n"
" float4 rs = rayFrom - spherePos;\n"
" rs.w = 0.f;\n"
" float4 rayDir = rayTo-rayFrom;\n"
" rayDir.w = 0.f;\n"
" float A = dot(rayDir,rayDir);\n"
" float B = dot(rs, rayDir);\n"
" float C = dot(rs, rs) - (radius * radius);\n"
" float D = B * B - A*C;\n"
" if (D > 0.0f)\n"
" {\n"
" float t = (-B - sqrt(D))/A;\n"
" if ( (t >= 0.0f) && (t < (*hitFraction)) )\n"
" {\n"
" *hitFraction = t;\n"
" return true;\n"
" }\n"
" }\n"
" return false;\n"
"}\n"
"float4 setInterpolate3(float4 from, float4 to, float t)\n"
"{\n"
" float s = 1.0f - t;\n"
" float4 result;\n"
" result = s * from + t * to;\n"
" result.w = 0.f; \n"
" return result; \n"
"}\n"
"__kernel void rayCastKernel( \n"
" int numRays, \n"
" const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" const int numBodies, \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes )\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i>=numRays)\n"
" return;\n"
" hitResults[i].m_hitFraction = 1.f;\n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex= -1;\n"
" int cachedCollidableIndex = -1;\n"
" Collidable cachedCollidable;\n"
" for (int b=0;b<numBodies;b++)\n"
" {\n"
" if (hitResults[i].m_hitResult2==b)\n"
" continue;\n"
" Body body = bodies[b];\n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" if (cachedCollidableIndex != body.m_collidableIdx)\n"
" {\n"
" cachedCollidableIndex = body.m_collidableIdx;\n"
" cachedCollidable = collidables[cachedCollidableIndex];\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[cachedCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[cachedCollidable.m_shapeIndex].m_faceOffset;\n"
" if (numFaces)\n"
" {\n"
" if (rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" \n"
" }\n"
" }\n"
" }\n"
" if (cachedCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = cachedCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitNormal = (float4) (hitPoint-bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" if (hitBodyIndex>=0)\n"
" {\n"
" hitPoint = setInterpolate3(rayFrom, rayTo,hitFraction);\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
"}\n"
"__kernel void findRayRigidPairIndexRanges(__global int2* rayRigidPairs, \n"
" __global int* out_firstRayRigidPairIndexPerRay,\n"
" __global int* out_numRayRigidPairsPerRay,\n"
" int numRayRigidPairs)\n"
"{\n"
" int rayRigidPairIndex = get_global_id(0);\n"
" if (rayRigidPairIndex >= numRayRigidPairs) return;\n"
" \n"
" int rayIndex = rayRigidPairs[rayRigidPairIndex].x;\n"
" \n"
" atomic_min(&out_firstRayRigidPairIndexPerRay[rayIndex], rayRigidPairIndex);\n"
" atomic_inc(&out_numRayRigidPairsPerRay[rayIndex]);\n"
"}\n"
"__kernel void rayCastPairsKernel(const __global b3RayInfo* rays, \n"
" __global b3RayHit* hitResults, \n"
" __global int* firstRayRigidPairIndexPerRay,\n"
" __global int* numRayRigidPairsPerRay,\n"
" \n"
" __global Body* bodies,\n"
" __global Collidable* collidables,\n"
" __global const b3GpuFace* faces,\n"
" __global const ConvexPolyhedronCL* convexShapes,\n"
" \n"
" __global int2* rayRigidPairs,\n"
" int numRays)\n"
"{\n"
" int i = get_global_id(0);\n"
" if (i >= numRays) return;\n"
" \n"
" float4 rayFrom = rays[i].m_from;\n"
" float4 rayTo = rays[i].m_to;\n"
" \n"
" hitResults[i].m_hitFraction = 1.f;\n"
" \n"
" float hitFraction = 1.f;\n"
" float4 hitPoint;\n"
" float4 hitNormal;\n"
" int hitBodyIndex = -1;\n"
" \n"
" //\n"
" for(int pair = 0; pair < numRayRigidPairsPerRay[i]; ++pair)\n"
" {\n"
" int rayRigidPairIndex = pair + firstRayRigidPairIndexPerRay[i];\n"
" int b = rayRigidPairs[rayRigidPairIndex].y;\n"
" \n"
" if (hitResults[i].m_hitResult2 == b) continue;\n"
" \n"
" Body body = bodies[b];\n"
" Collidable rigidCollidable = collidables[body.m_collidableIdx];\n"
" \n"
" float4 pos = body.m_pos;\n"
" float4 orn = body.m_quat;\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_CONVEX_HULL)\n"
" {\n"
" float4 invPos = (float4)(0,0,0,0);\n"
" float4 invOrn = (float4)(0,0,0,0);\n"
" float4 rayFromLocal = (float4)(0,0,0,0);\n"
" float4 rayToLocal = (float4)(0,0,0,0);\n"
" invOrn = qtInvert(orn);\n"
" invPos = qtRotate(invOrn, -pos);\n"
" rayFromLocal = qtRotate( invOrn, rayFrom ) + invPos;\n"
" rayToLocal = qtRotate( invOrn, rayTo) + invPos;\n"
" rayFromLocal.w = 0.f;\n"
" rayToLocal.w = 0.f;\n"
" int numFaces = convexShapes[rigidCollidable.m_shapeIndex].m_numFaces;\n"
" int faceOffset = convexShapes[rigidCollidable.m_shapeIndex].m_faceOffset;\n"
" \n"
" if (numFaces && rayConvex(rayFromLocal, rayToLocal, numFaces, faceOffset,faces, &hitFraction, &hitNormal))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" }\n"
" }\n"
" \n"
" if (rigidCollidable.m_shapeType == SHAPE_SPHERE)\n"
" {\n"
" float radius = rigidCollidable.m_radius;\n"
" \n"
" if (sphere_intersect(pos, radius, rayFrom, rayTo, &hitFraction))\n"
" {\n"
" hitBodyIndex = b;\n"
" hitPoint = setInterpolate3(rayFrom, rayTo, hitFraction);\n"
" hitNormal = (float4) (hitPoint - bodies[b].m_pos);\n"
" }\n"
" }\n"
" }\n"
" \n"
" if (hitBodyIndex >= 0)\n"
" {\n"
" hitResults[i].m_hitFraction = hitFraction;\n"
" hitResults[i].m_hitPoint = hitPoint;\n"
" hitResults[i].m_hitNormal = normalize(hitNormal);\n"
" hitResults[i].m_hitResult0 = hitBodyIndex;\n"
" }\n"
" \n"
"}\n";

View File

@@ -5,14 +5,13 @@
#include "Bullet3Dynamics/shared/b3ContactConstraint4.h"
B3_ATTRIBUTE_ALIGNED16(struct) b3GpuConstraint4 : public b3ContactConstraint4
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuConstraint4 : public b3ContactConstraint4
{
B3_DECLARE_ALIGNED_ALLOCATOR();
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
inline void setFrictionCoeff(float value) { m_linear[3] = value; }
inline float getFrictionCoeff() const { return m_linear[3]; }
};
#endif //B3_CONSTRAINT4_h
#endif //B3_CONSTRAINT4_h

View File

@@ -19,11 +19,11 @@ subject to the following restrictions:
#include <new>
#include "Bullet3Common/b3Transform.h"
void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData* bodies)
void b3GpuGenericConstraint::getInfo1(unsigned int* info, const b3RigidBodyData* bodies)
{
switch (m_constraintType)
{
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
{
*info = 3;
break;
@@ -35,7 +35,7 @@ void b3GpuGenericConstraint::getInfo1 (unsigned int* info,const b3RigidBodyData*
};
}
void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
b3Transform trA;
trA.setIdentity();
@@ -47,54 +47,52 @@ void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo
trB.setOrigin(bodies[constraint->m_rbB].m_pos);
trB.setRotation(bodies[constraint->m_rbB].m_quat);
// anchor points in global coordinates with respect to body PORs.
// set jacobian
info->m_J1linearAxis[0] = 1;
info->m_J1linearAxis[info->rowskip+1] = 1;
info->m_J1linearAxis[2*info->rowskip+2] = 1;
// anchor points in global coordinates with respect to body PORs.
b3Vector3 a1 = trA.getBasis()*constraint->getPivotInA();
// set jacobian
info->m_J1linearAxis[0] = 1;
info->m_J1linearAxis[info->rowskip + 1] = 1;
info->m_J1linearAxis[2 * info->rowskip + 2] = 1;
b3Vector3 a1 = trA.getBasis() * constraint->getPivotInA();
//b3Vector3 a1a = b3QuatRotate(trA.getRotation(),constraint->getPivotInA());
{
b3Vector3* angular0 = (b3Vector3*)(info->m_J1angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis+info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis+2*info->rowskip);
b3Vector3* angular1 = (b3Vector3*)(info->m_J1angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J1angularAxis + 2 * info->rowskip);
b3Vector3 a1neg = -a1;
a1neg.getSkewSymmetricMatrix(angular0,angular1,angular2);
a1neg.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
if (info->m_J2linearAxis)
{
info->m_J2linearAxis[0] = -1;
info->m_J2linearAxis[info->rowskip+1] = -1;
info->m_J2linearAxis[2*info->rowskip+2] = -1;
info->m_J2linearAxis[info->rowskip + 1] = -1;
info->m_J2linearAxis[2 * info->rowskip + 2] = -1;
}
b3Vector3 a2 = trB.getBasis()*constraint->getPivotInB();
b3Vector3 a2 = trB.getBasis() * constraint->getPivotInB();
{
// b3Vector3 a2n = -a2;
// b3Vector3 a2n = -a2;
b3Vector3* angular0 = (b3Vector3*)(info->m_J2angularAxis);
b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis+info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis+2*info->rowskip);
a2.getSkewSymmetricMatrix(angular0,angular1,angular2);
b3Vector3* angular1 = (b3Vector3*)(info->m_J2angularAxis + info->rowskip);
b3Vector3* angular2 = (b3Vector3*)(info->m_J2angularAxis + 2 * info->rowskip);
a2.getSkewSymmetricMatrix(angular0, angular1, angular2);
}
// set right hand side
// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
// set right hand side
// b3Scalar currERP = (m_flags & B3_P2P_FLAGS_ERP) ? m_erp : info->erp;
b3Scalar currERP = info->erp;
b3Scalar k = info->fps * currERP;
int j;
for (j=0; j<3; j++)
{
info->m_constraintError[j*info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
int j;
for (j = 0; j < 3; j++)
{
info->m_constraintError[j * info->rowskip] = k * (a2[j] + trB.getOrigin()[j] - a1[j] - trA.getOrigin()[j]);
//printf("info->m_constraintError[%d]=%f\n",j,info->m_constraintError[j]);
}
}
#if 0
if(m_flags & B3_P2P_FLAGS_CFM)
{
@@ -117,21 +115,20 @@ void getInfo2Point2Point(b3GpuGenericConstraint* constraint, b3GpuConstraintInfo
}
info->m_damping = m_setting.m_damping;
#endif
}
void b3GpuGenericConstraint::getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
void b3GpuGenericConstraint::getInfo2(b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies)
{
switch (m_constraintType)
{
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
case B3_GPU_POINT2POINT_CONSTRAINT_TYPE:
{
getInfo2Point2Point(this,info,bodies);
getInfo2Point2Point(this, info, bodies);
break;
};
default:
{
b3Assert(0);
}
{
b3Assert(0);
}
};
}

View File

@@ -20,37 +20,35 @@ subject to the following restrictions:
struct b3RigidBodyData;
enum B3_CONSTRAINT_FLAGS
{
B3_CONSTRAINT_FLAG_ENABLED=1,
B3_CONSTRAINT_FLAG_ENABLED = 1,
};
enum b3GpuGenericConstraintType
{
B3_GPU_POINT2POINT_CONSTRAINT_TYPE=3,
B3_GPU_FIXED_CONSTRAINT_TYPE=4,
// B3_HINGE_CONSTRAINT_TYPE,
// B3_CONETWIST_CONSTRAINT_TYPE,
// B3_D6_CONSTRAINT_TYPE,
// B3_SLIDER_CONSTRAINT_TYPE,
// B3_CONTACT_CONSTRAINT_TYPE,
// B3_D6_SPRING_CONSTRAINT_TYPE,
// B3_GEAR_CONSTRAINT_TYPE,
B3_GPU_POINT2POINT_CONSTRAINT_TYPE = 3,
B3_GPU_FIXED_CONSTRAINT_TYPE = 4,
// B3_HINGE_CONSTRAINT_TYPE,
// B3_CONETWIST_CONSTRAINT_TYPE,
// B3_D6_CONSTRAINT_TYPE,
// B3_SLIDER_CONSTRAINT_TYPE,
// B3_CONTACT_CONSTRAINT_TYPE,
// B3_D6_SPRING_CONSTRAINT_TYPE,
// B3_GEAR_CONSTRAINT_TYPE,
B3_GPU_MAX_CONSTRAINT_TYPE
};
struct b3GpuConstraintInfo2
struct b3GpuConstraintInfo2
{
// integrator parameters: frames per second (1/stepsize), default error
// reduction parameter (0..1).
b3Scalar fps,erp;
b3Scalar fps, erp;
// for the first and second body, pointers to two (linear and angular)
// n*3 jacobian sub matrices, stored by rows. these matrices will have
// been initialized to 0 on entry. if the second body is zero then the
// J2xx pointers may be 0.
b3Scalar *m_J1linearAxis,*m_J1angularAxis,*m_J2linearAxis,*m_J2angularAxis;
b3Scalar *m_J1linearAxis, *m_J1angularAxis, *m_J2linearAxis, *m_J2angularAxis;
// elements to jump from one row to the next in J's
int rowskip;
@@ -58,44 +56,44 @@ struct b3GpuConstraintInfo2
// right hand sides of the equation J*v = c + cfm * lambda. cfm is the
// "constraint force mixing" vector. c is set to zero on entry, cfm is
// set to a constant value (typically very small or zero) value on entry.
b3Scalar *m_constraintError,*cfm;
b3Scalar *m_constraintError, *cfm;
// lo and hi limits for variables (set to -/+ infinity on entry).
b3Scalar *m_lowerLimit,*m_upperLimit;
b3Scalar *m_lowerLimit, *m_upperLimit;
// findex vector for variables. see the LCP solver interface for a
// description of what this does. this is set to -1 on entry.
// note that the returned indexes are relative to the first index of
// the constraint.
int *findex;
int* findex;
// number of solver iterations
int m_numIterations;
//damping of the velocity
b3Scalar m_damping;
b3Scalar m_damping;
};
B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuGenericConstraint
{
int m_constraintType;
int m_rbA;
int m_rbB;
float m_breakingImpulseThreshold;
int m_constraintType;
int m_rbA;
int m_rbB;
float m_breakingImpulseThreshold;
b3Vector3 m_pivotInA;
b3Vector3 m_pivotInB;
b3Quaternion m_relTargetAB;
int m_flags;
int m_flags;
int m_uid;
int m_padding[2];
int getRigidBodyA() const
int getRigidBodyA() const
{
return m_rbA;
}
int getRigidBodyB() const
int getRigidBodyB() const
{
return m_rbB;
}
@@ -121,12 +119,10 @@ B3_ATTRIBUTE_ALIGNED16(struct) b3GpuGenericConstraint
}
///internal method used by the constraint solver, don't use them directly
void getInfo1 (unsigned int* info,const b3RigidBodyData* bodies);
void getInfo1(unsigned int* info, const b3RigidBodyData* bodies);
///internal method used by the constraint solver, don't use them directly
void getInfo2 (b3GpuConstraintInfo2* info, const b3RigidBodyData* bodies);
void getInfo2(b3GpuConstraintInfo2 * info, const b3RigidBodyData* bodies);
};
#endif //B3_GPU_GENERIC_CONSTRAINT_H
#endif //B3_GPU_GENERIC_CONSTRAINT_H

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,6 @@
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Contact4Data.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
//struct b3InertiaData;
//b3InertiaData
@@ -21,21 +20,20 @@ struct b3JacobiSolverInfo
float m_deltaTime;
float m_positionDrift;
float m_positionConstraintCoeff;
int m_numIterations;
int m_numIterations;
b3JacobiSolverInfo()
:m_fixedBodyIndex(0),
m_deltaTime(1./60.f),
m_positionDrift( 0.005f ),
m_positionConstraintCoeff( 0.99f ),
m_numIterations(7)
: m_fixedBodyIndex(0),
m_deltaTime(1. / 60.f),
m_positionDrift(0.005f),
m_positionConstraintCoeff(0.99f),
m_numIterations(7)
{
}
};
class b3GpuJacobiContactSolver
{
protected:
struct b3GpuJacobiSolverInternalData* m_data;
cl_context m_context;
@@ -43,20 +41,16 @@ protected:
cl_command_queue m_queue;
public:
b3GpuJacobiContactSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3GpuJacobiContactSolver();
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
void solveGroupHost(b3RigidBodyData* bodies,b3InertiaData* inertias,int numBodies,struct b3Contact4* manifoldPtr, int numManifolds,const b3JacobiSolverInfo& solverInfo);
void solveGroupHost(b3RigidBodyData* bodies, b3InertiaData* inertias, int numBodies, struct b3Contact4* manifoldPtr, int numManifolds, const b3JacobiSolverInfo& solverInfo);
//void solveGroupHost(btRigidBodyCL* bodies,b3InertiaData* inertias,int numBodies,btContact4* manifoldPtr, int numManifolds,btTypedConstraint** constraints,int numConstraints,const btJacobiSolverInfo& solverInfo);
//b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
//void solveGroup(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
//void solveGroupMixed(btOpenCLArray<btRigidBodyCL>* bodies,btOpenCLArray<btInertiaCL>* inertias,btOpenCLArray<btContact4>* manifoldPtr,const btJacobiSolverInfo& solverInfo);
};
#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H
#endif //B3_GPU_JACOBI_CONTACT_SOLVER_H

File diff suppressed because it is too large Load Diff

View File

@@ -9,11 +9,10 @@
class b3GpuNarrowPhase
{
protected:
struct b3GpuNarrowPhaseInternalData* m_data;
struct b3GpuNarrowPhaseInternalData* m_data;
int m_acceleratedCompanionShapeIndex;
int m_planeBodyIndex;
int m_static0Index;
int m_static0Index;
cl_context m_context;
cl_device_id m_device;
@@ -23,64 +22,58 @@ protected:
int registerConcaveMeshShape(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, b3Collidable& col, const float* scaling);
public:
b3GpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct b3Config& config);
virtual ~b3GpuNarrowPhase(void);
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerSphereShape(float radius);
int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerFace(const b3Vector3& faceNormal, float faceConstant);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices,const float* scaling);
int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//do they need to be merged?
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
int registerConvexHullShape(b3ConvexUtility* utilPtr);
int registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
void writeAllBodiesToGpu();
void reset();
void readbackAllBodiesToCpu();
bool getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const;
int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax, bool writeToGpu);
void setObjectTransform(const float* position, const float* orientation, int bodyIndex);
void setObjectTransformCpu(float* position, float* orientation , int bodyIndex);
void writeAllBodiesToGpu();
void reset();
void readbackAllBodiesToCpu();
bool getObjectTransformFromCpu(float* position, float* orientation, int bodyIndex) const;
void setObjectTransformCpu(float* position, float* orientation, int bodyIndex);
void setObjectVelocityCpu(float* linVel, float* angVel, int bodyIndex);
virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWorldSpace, int numObjects);
cl_mem getBodiesGpu();
cl_mem getBodiesGpu();
const struct b3RigidBodyData* getBodiesCpu() const;
//struct b3RigidBodyData* getBodiesCpu();
int getNumBodiesGpu() const;
int getNumBodiesGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getBodyInertiasGpu();
int getNumBodyInertiasGpu() const;
cl_mem getCollidablesGpu();
cl_mem getCollidablesGpu();
const struct b3Collidable* getCollidablesCpu() const;
int getNumCollidablesGpu() const;
int getNumCollidablesGpu() const;
const struct b3SapAabb* getLocalSpaceAabbsCpu() const;
const struct b3Contact4* getContactsCPU() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
cl_mem getContactsGpu();
int getNumContactsGpu() const;
cl_mem getAabbLocalSpaceBufferGpu();
cl_mem getAabbLocalSpaceBufferGpu();
int getNumRigidBodies() const;
int allocateCollidable();
@@ -92,18 +85,17 @@ public:
b3Collidable& getCollidableCpu(int collidableIndex);
const b3Collidable& getCollidableCpu(int collidableIndex) const;
const b3GpuNarrowPhaseInternalData* getInternalData() const
const b3GpuNarrowPhaseInternalData* getInternalData() const
{
return m_data;
return m_data;
}
b3GpuNarrowPhaseInternalData* getInternalData()
b3GpuNarrowPhaseInternalData* getInternalData()
{
return m_data;
return m_data;
}
const struct b3SapAabb& getLocalSpaceAabb(int collidableIndex) const;
};
#endif //B3_GPU_NARROWPHASE_H
#endif //B3_GPU_NARROWPHASE_H

View File

@@ -20,57 +20,53 @@
#include "Bullet3Common/shared/b3Int4.h"
#include "Bullet3Common/shared/b3Int2.h"
class b3ConvexUtility;
struct b3GpuNarrowPhaseInternalData
{
b3AlignedObjectArray<b3ConvexUtility*>* m_convexData;
b3AlignedObjectArray<b3ConvexPolyhedronData> m_convexPolyhedra;
b3AlignedObjectArray<b3Vector3> m_uniqueEdges;
b3AlignedObjectArray<b3Vector3> m_convexVertices;
b3AlignedObjectArray<int> m_convexIndices;
b3OpenCLArray<b3ConvexPolyhedronData>* m_convexPolyhedraGPU;
b3OpenCLArray<b3Vector3>* m_uniqueEdgesGPU;
b3OpenCLArray<b3Vector3>* m_convexVerticesGPU;
b3OpenCLArray<int>* m_convexIndicesGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB1GPU;
b3OpenCLArray<b3Int4>* m_clippingFacesOutGPU;
b3OpenCLArray<b3Vector3>* m_worldNormalsAGPU;
b3OpenCLArray<b3Vector3>* m_worldVertsA1GPU;
b3OpenCLArray<b3Vector3>* m_worldVertsB2GPU;
b3AlignedObjectArray<b3GpuChildShape> m_cpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
b3OpenCLArray<b3GpuChildShape>* m_gpuChildShapes;
b3AlignedObjectArray<b3GpuFace> m_convexFaces;
b3OpenCLArray<b3GpuFace>* m_convexFacesGPU;
struct GpuSatCollision* m_gpuSatCollision;
b3OpenCLArray<b3Int4>* m_triangleConvexPairs;
struct GpuSatCollision* m_gpuSatCollision;
b3OpenCLArray<b3Int4>* m_triangleConvexPairs;
b3OpenCLArray<b3Contact4>* m_pBufContactBuffersGPU[2];
int m_currentContactBuffer;
int m_currentContactBuffer;
b3AlignedObjectArray<b3Contact4>* m_pBufContactOutCPU;
b3AlignedObjectArray<b3RigidBodyData>* m_bodyBufferCPU;
b3OpenCLArray<b3RigidBodyData>* m_bodyBufferGPU;
b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
b3AlignedObjectArray<b3InertiaData>* m_inertiaBufferCPU;
b3OpenCLArray<b3InertiaData>* m_inertiaBufferGPU;
int m_numAcceleratedShapes;
int m_numAcceleratedRigidBodies;
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
b3AlignedObjectArray<b3Collidable> m_collidablesCPU;
b3OpenCLArray<b3Collidable>* m_collidablesGPU;
b3OpenCLArray<b3SapAabb>* m_localShapeAABBGPU;
b3AlignedObjectArray<b3SapAabb>* m_localShapeAABBCPU;
@@ -78,18 +74,16 @@ struct b3GpuNarrowPhaseInternalData
b3AlignedObjectArray<class b3OptimizedBvh*> m_bvhData;
b3AlignedObjectArray<class b3TriangleIndexVertexArray*> m_meshInterfaces;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3QuantizedBvhNode> m_treeNodesCPU;
b3AlignedObjectArray<b3BvhSubtreeInfo> m_subTreesCPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
b3AlignedObjectArray<b3BvhInfo> m_bvhInfoCPU;
b3OpenCLArray<b3BvhInfo>* m_bvhInfoGPU;
b3Config m_config;
b3OpenCLArray<b3QuantizedBvhNode>* m_treeNodesGPU;
b3OpenCLArray<b3BvhSubtreeInfo>* m_subTreesGPU;
b3Config m_config;
};
#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H
#endif //B3_GPU_NARROWPHASE_INTERNAL_DATA_H

File diff suppressed because it is too large Load Diff

View File

@@ -19,7 +19,6 @@ subject to the following restrictions:
struct b3Contact4;
struct b3ContactPoint;
class b3Dispatcher;
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
@@ -38,41 +37,40 @@ class b3GpuPgsConstraintSolver
protected:
int m_staticIdx;
struct b3GpuPgsJacobiSolverInternalData* m_gpuData;
protected:
b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool;
b3GpuConstraintArray m_tmpSolverContactConstraintPool;
b3GpuConstraintArray m_tmpSolverNonContactConstraintPool;
b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool;
b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;
protected:
b3AlignedObjectArray<b3GpuSolverBody> m_tmpSolverBodyPool;
b3GpuConstraintArray m_tmpSolverContactConstraintPool;
b3GpuConstraintArray m_tmpSolverNonContactConstraintPool;
b3GpuConstraintArray m_tmpSolverContactFrictionConstraintPool;
b3GpuConstraintArray m_tmpSolverContactRollingFrictionConstraintPool;
b3AlignedObjectArray<unsigned int> m_tmpConstraintSizesPool;
bool m_usePgs;
void averageVelocities();
bool m_usePgs;
void averageVelocities();
int m_maxOverrideNumSolverIterations;
int m_maxOverrideNumSolverIterations;
int m_numSplitImpulseRecoveries;
int m_numSplitImpulseRecoveries;
// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
// int getOrInitSolverBody(int bodyIndex, b3RigidBodyData* bodies,b3InertiaData* inertias);
void initSolverBody(int bodyIndex, b3GpuSolverBody* solverBody, b3RigidBodyData* rb);
public:
b3GpuPgsConstraintSolver (cl_context ctx, cl_device_id device, cl_command_queue queue,bool usePgs);
virtual~b3GpuPgsConstraintSolver ();
b3GpuPgsConstraintSolver(cl_context ctx, cl_device_id device, cl_command_queue queue, bool usePgs);
virtual ~b3GpuPgsConstraintSolver();
virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1,int numConstraints,const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias,int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlyIterations(b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints1, int numConstraints, const b3ContactSolverInfo& infoGlobal);
virtual b3Scalar solveGroupCacheFriendlySetup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroupCacheFriendlyFinish(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints, int numConstraints, const b3ContactSolverInfo& infoGlobal);
void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints);
b3Scalar solveGroup(b3OpenCLArray<b3RigidBodyData>* gpuBodies,b3OpenCLArray<b3InertiaData>* gpuInertias, int numBodies,b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints,int numConstraints,const b3ContactSolverInfo& infoGlobal);
void solveJoints(int numBodies, b3OpenCLArray<b3RigidBodyData>* gpuBodies, b3OpenCLArray<b3InertiaData>* gpuInertias,
int numConstraints, b3OpenCLArray<b3GpuGenericConstraint>* gpuConstraints);
int sortConstraintByBatch3( struct b3BatchConstraint* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies);
void recomputeBatches();
int sortConstraintByBatch3(struct b3BatchConstraint* cs, int numConstraints, int simdWidth, int staticIdx, int numBodies);
void recomputeBatches();
};
#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H
#endif //B3_GPU_PGS_CONSTRAINT_SOLVER_H

File diff suppressed because it is too large Load Diff

View File

@@ -11,33 +11,27 @@
class b3GpuPgsContactSolver
{
protected:
int m_debugOutput;
struct b3GpuBatchingPgsSolverInternalData* m_data;
struct b3GpuBatchingPgsSolverInternalData* m_data;
void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
inline int sortConstraintByBatch( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch2( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch3( b3Contact4* cs, int n, int simdWidth , int staticIdx, int numBodies, int* batchSizes);
void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx);
void solveContactConstraintBatchSizes( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes);
inline int sortConstraintByBatch(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch2(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies);
inline int sortConstraintByBatch3(b3Contact4* cs, int n, int simdWidth, int staticIdx, int numBodies, int* batchSizes);
void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes);//const b3OpenCLArray<int>* gpuBatchSizes);
void solveContactConstraintBatchSizes(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, int numIterations, const b3AlignedObjectArray<int>* batchSizes); //const b3OpenCLArray<int>* gpuBatchSizes);
public:
b3GpuPgsContactSolver(cl_context ctx,cl_device_id device, cl_command_queue q,int pairCapacity);
b3GpuPgsContactSolver(cl_context ctx, cl_device_id device, cl_command_queue q, int pairCapacity);
virtual ~b3GpuPgsContactSolver();
void solveContacts(int numBodies, cl_mem bodyBuf, cl_mem inertiaBuf, int numContacts, cl_mem contactBuf, const struct b3Config& config, int static0Index);
};
#endif //B3_GPU_BATCHING_PGS_SOLVER_H
#endif //B3_GPU_BATCHING_PGS_SOLVER_H

View File

@@ -47,7 +47,7 @@ bool gClearPairsOnGpu = true;
#define TEST_OTHER_GPU_SOLVER 1
#ifdef TEST_OTHER_GPU_SOLVER
#include "b3GpuJacobiContactSolver.h"
#endif //TEST_OTHER_GPU_SOLVER
#endif //TEST_OTHER_GPU_SOLVER
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3RigidBodyData.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Contact4.h"
@@ -59,73 +59,68 @@ bool gClearPairsOnGpu = true;
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3OpenCL/Raycast/b3GpuRaycast.h"
#include "Bullet3Dynamics/shared/b3IntegrateTransforms.h"
#include "Bullet3OpenCL/RigidBody/b3GpuNarrowPhaseInternalData.h"
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q,class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap , struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config)
{
m_data = new b3GpuRigidBodyPipelineInternalData;
m_data->m_constraintUid=0;
m_data->m_constraintUid = 0;
m_data->m_config = config;
m_data->m_context = ctx;
m_data->m_device = device;
m_data->m_queue = q;
m_data->m_solver = new b3PgsJacobiSolver(true);//new b3PgsJacobiSolver(true);
m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx,device,q,true);//new b3PgsJacobiSolver(true);
m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx,q,config.m_maxConvexBodies);
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx,q,config.m_maxBroadphasePairs);
m_data->m_solver = new b3PgsJacobiSolver(true); //new b3PgsJacobiSolver(true);
m_data->m_gpuSolver = new b3GpuPgsConstraintSolver(ctx, device, q, true); //new b3PgsJacobiSolver(true);
m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx,q);
m_data->m_allAabbsGPU = new b3OpenCLArray<b3SapAabb>(ctx, q, config.m_maxConvexBodies);
m_data->m_overlappingPairsGPU = new b3OpenCLArray<b3BroadphasePair>(ctx, q, config.m_maxBroadphasePairs);
m_data->m_gpuConstraints = new b3OpenCLArray<b3GpuGenericConstraint>(ctx, q);
#ifdef TEST_OTHER_GPU_SOLVER
m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx,device,q,config.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER
m_data->m_solver2 = new b3GpuPgsContactSolver(ctx,device,q,config.m_maxBroadphasePairs);
m_data->m_solver3 = new b3GpuJacobiContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
#endif // TEST_OTHER_GPU_SOLVER
m_data->m_raycaster = new b3GpuRaycast(ctx,device,q);
m_data->m_solver2 = new b3GpuPgsContactSolver(ctx, device, q, config.m_maxBroadphasePairs);
m_data->m_raycaster = new b3GpuRaycast(ctx, device, q);
m_data->m_broadphaseDbvt = broadphaseDbvt;
m_data->m_broadphaseSap = broadphaseSap;
m_data->m_narrowphase = narrowphase;
m_data->m_gravity.setValue(0.f,-9.8f,0.f);
m_data->m_gravity.setValue(0.f, -9.8f, 0.f);
cl_int errNum=0;
cl_int errNum = 0;
{
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"",B3_RIGIDBODY_INTEGRATE_PATH);
b3Assert(errNum==CL_SUCCESS);
m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, integrateKernelCL, &errNum, "", B3_RIGIDBODY_INTEGRATE_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_integrateTransformsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, integrateKernelCL, "integrateTransformsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
{
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,updateAabbsKernelCL,&errNum,"",B3_RIGIDBODY_UPDATEAABB_PATH);
b3Assert(errNum==CL_SUCCESS);
m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
cl_program prog = b3OpenCLUtils::compileCLProgramFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, &errNum, "", B3_RIGIDBODY_UPDATEAABB_PATH);
b3Assert(errNum == CL_SUCCESS);
m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "initializeGpuAabbsFull", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "clearOverlappingPairsKernel",&errNum,prog);
b3Assert(errNum==CL_SUCCESS);
m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device, updateAabbsKernelCL, "clearOverlappingPairsKernel", &errNum, prog);
b3Assert(errNum == CL_SUCCESS);
clReleaseProgram(prog);
}
}
b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
{
if (m_data->m_integrateTransformsKernel)
clReleaseKernel(m_data->m_integrateTransformsKernel);
if (m_data->m_updateAabbsKernel)
clReleaseKernel(m_data->m_updateAabbsKernel);
if (m_data->m_clearOverlappingPairsKernel)
clReleaseKernel(m_data->m_clearOverlappingPairsKernel);
delete m_data->m_raycaster;
@@ -136,15 +131,14 @@ b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
#ifdef TEST_OTHER_GPU_SOLVER
delete m_data->m_solver3;
#endif //TEST_OTHER_GPU_SOLVER
#endif //TEST_OTHER_GPU_SOLVER
delete m_data->m_solver2;
delete m_data;
}
void b3GpuRigidBodyPipeline::reset()
void b3GpuRigidBodyPipeline::reset()
{
m_data->m_gpuConstraints->resize(0);
m_data->m_cpuConstraints.resize(0);
@@ -152,30 +146,28 @@ void b3GpuRigidBodyPipeline::reset()
m_data->m_allAabbsCPU.resize(0);
}
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
void b3GpuRigidBodyPipeline::addConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.push_back(constraint);
}
void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint)
void b3GpuRigidBodyPipeline::removeConstraint(b3TypedConstraint* constraint)
{
m_data->m_joints.remove(constraint);
}
void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
{
m_data->m_gpuSolver->recomputeBatches();
//slow linear search
m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
//remove
for (int i=0;i<m_data->m_cpuConstraints.size();i++)
for (int i = 0; i < m_data->m_cpuConstraints.size(); i++)
{
if (m_data->m_cpuConstraints[i].m_uid == uid)
{
//m_data->m_cpuConstraints.remove(m_data->m_cpuConstraints[i]);
m_data->m_cpuConstraints.swap(i,m_data->m_cpuConstraints.size()-1);
m_data->m_cpuConstraints.swap(i, m_data->m_cpuConstraints.size() - 1);
m_data->m_cpuConstraints.pop_back();
break;
@@ -185,13 +177,13 @@ void b3GpuRigidBodyPipeline::removeConstraintByUid(int uid)
if (m_data->m_cpuConstraints.size())
{
m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
} else
}
else
{
m_data->m_gpuConstraints->resize(0);
}
}
int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold)
int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
@@ -200,14 +192,14 @@ int b3GpuRigidBodyPipeline::createPoint2PointConstraint(int bodyA, int bodyB, co
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]);
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_POINT2POINT_CONSTRAINT_TYPE;
m_data->m_cpuConstraints.push_back(c);
return c.m_uid;
}
int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB,float breakingThreshold)
int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold)
{
m_data->m_gpuSolver->recomputeBatches();
b3GpuGenericConstraint c;
@@ -216,9 +208,9 @@ int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const fl
c.m_flags = B3_CONSTRAINT_FLAG_ENABLED;
c.m_rbA = bodyA;
c.m_rbB = bodyB;
c.m_pivotInA.setValue(pivotInA[0],pivotInA[1],pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0],pivotInB[1],pivotInB[2]);
c.m_relTargetAB.setValue(relTargetAB[0],relTargetAB[1],relTargetAB[2],relTargetAB[3]);
c.m_pivotInA.setValue(pivotInA[0], pivotInA[1], pivotInA[2]);
c.m_pivotInB.setValue(pivotInB[0], pivotInB[1], pivotInB[2]);
c.m_relTargetAB.setValue(relTargetAB[0], relTargetAB[1], relTargetAB[2], relTargetAB[3]);
c.m_breakingImpulseThreshold = breakingThreshold;
c.m_constraintType = B3_GPU_FIXED_CONSTRAINT_TYPE;
@@ -226,31 +218,28 @@ int b3GpuRigidBodyPipeline::createFixedConstraint(int bodyA, int bodyB, const fl
return c.m_uid;
}
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
//update worldspace AABBs from local AABB/worldtransform
{
B3_PROFILE("setupGpuAabbs");
setupGpuAabbsFull();
}
int numPairs =0;
int numPairs = 0;
//compute overlapping pairs
{
if (gUseDbvt)
{
{
B3_PROFILE("setAabb");
m_data->m_allAabbsGPU->copyToHost(m_data->m_allAabbsCPU);
for (int i=0;i<m_data->m_allAabbsCPU.size();i++)
for (int i = 0; i < m_data->m_allAabbsCPU.size(); i++)
{
b3Vector3 aabbMin=b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0],m_data->m_allAabbsCPU[i].m_min[1],m_data->m_allAabbsCPU[i].m_min[2]);
b3Vector3 aabbMax=b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0],m_data->m_allAabbsCPU[i].m_max[1],m_data->m_allAabbsCPU[i].m_max[2]);
m_data->m_broadphaseDbvt->setAabb(i,aabbMin,aabbMax,0);
b3Vector3 aabbMin = b3MakeVector3(m_data->m_allAabbsCPU[i].m_min[0], m_data->m_allAabbsCPU[i].m_min[1], m_data->m_allAabbsCPU[i].m_min[2]);
b3Vector3 aabbMax = b3MakeVector3(m_data->m_allAabbsCPU[i].m_max[0], m_data->m_allAabbsCPU[i].m_max[1], m_data->m_allAabbsCPU[i].m_max[2]);
m_data->m_broadphaseDbvt->setAabb(i, aabbMin, aabbMax, 0);
}
}
@@ -259,13 +248,14 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
m_data->m_broadphaseDbvt->calculateOverlappingPairs();
}
numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
} else
}
else
{
if (gUseCalculateOverlappingPairsHost)
{
m_data->m_broadphaseSap->calculateOverlappingPairsHost(m_data->m_config.m_maxBroadphasePairs);
} else
}
else
{
m_data->m_broadphaseSap->calculateOverlappingPairs(m_data->m_config.m_maxBroadphasePairs);
}
@@ -274,24 +264,24 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
//compute contact points
// printf("numPairs=%d\n",numPairs);
int numContacts = 0;
// printf("numPairs=%d\n",numPairs);
int numContacts = 0;
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
if (numPairs)
{
cl_mem pairs =0;
cl_mem aabbsWS =0;
cl_mem pairs = 0;
cl_mem aabbsWS = 0;
if (gUseDbvt)
{
B3_PROFILE("m_overlappingPairsGPU->copyFromHost");
m_data->m_overlappingPairsGPU->copyFromHost(m_data->m_broadphaseDbvt->getOverlappingPairCache()->getOverlappingPairArray());
pairs = m_data->m_overlappingPairsGPU->getBufferCL();
aabbsWS = m_data->m_allAabbsGPU->getBufferCL();
} else
}
else
{
pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
@@ -302,31 +292,27 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
//mark the contacts for each pair as 'unused'
if (numPairs)
{
b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context,m_data->m_queue);
gpuPairs.setFromOpenCLBuffer(pairs,numPairs);
b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context, m_data->m_queue);
gpuPairs.setFromOpenCLBuffer(pairs, numPairs);
if (gClearPairsOnGpu)
{
//b3AlignedObjectArray<b3BroadphasePair> hostPairs;//just for debugging
//gpuPairs.copyToHost(hostPairs);
b3LauncherCL launcher(m_data->m_queue,m_data->m_clearOverlappingPairsKernel,"clearOverlappingPairsKernel");
b3LauncherCL launcher(m_data->m_queue, m_data->m_clearOverlappingPairsKernel, "clearOverlappingPairsKernel");
launcher.setBuffer(pairs);
launcher.setConst(numPairs);
launcher.launch1D(numPairs);
//gpuPairs.copyToHost(hostPairs);
} else
}
else
{
b3AlignedObjectArray<b3BroadphasePair> hostPairs;
gpuPairs.copyToHost(hostPairs);
for (int i=0;i<hostPairs.size();i++)
for (int i = 0; i < hostPairs.size(); i++)
{
hostPairs[i].z = 0xffffffff;
}
@@ -335,7 +321,7 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
}
m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies);
m_data->m_narrowphase->computeContacts(pairs, numPairs, aabbsWS, numBodies);
numContacts = m_data->m_narrowphase->getNumContactsGpu();
if (gUseDbvt)
@@ -347,56 +333,54 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
if (gDumpContactStats && numContacts)
{
m_data->m_narrowphase->getContactsGpu();
printf("numContacts = %d\n", numContacts);
int totalPoints = 0;
int totalPoints = 0;
const b3Contact4* contacts = m_data->m_narrowphase->getContactsCPU();
for (int i=0;i<numContacts;i++)
for (int i = 0; i < numContacts; i++)
{
totalPoints += contacts->getNPoints();
}
printf("totalPoints=%d\n",totalPoints);
printf("totalPoints=%d\n", totalPoints);
}
}
//convert contact points to contact constraints
//solve constraints
b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context,m_data->m_queue,0,true);
gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(),m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context,m_data->m_queue,0,true);
gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(),m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context,m_data->m_queue,0,true);
gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(),m_data->m_narrowphase->getNumContactsGpu());
b3OpenCLArray<b3RigidBodyData> gpuBodies(m_data->m_context, m_data->m_queue, 0, true);
gpuBodies.setFromOpenCLBuffer(m_data->m_narrowphase->getBodiesGpu(), m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3InertiaData> gpuInertias(m_data->m_context, m_data->m_queue, 0, true);
gpuInertias.setFromOpenCLBuffer(m_data->m_narrowphase->getBodyInertiasGpu(), m_data->m_narrowphase->getNumRigidBodies());
b3OpenCLArray<b3Contact4> gpuContacts(m_data->m_context, m_data->m_queue, 0, true);
gpuContacts.setFromOpenCLBuffer(m_data->m_narrowphase->getContactsGpu(), m_data->m_narrowphase->getNumContactsGpu());
int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size();
int numJoints = m_data->m_joints.size() ? m_data->m_joints.size() : m_data->m_cpuConstraints.size();
if (useBullet2CpuSolver && numJoints)
{
// b3AlignedObjectArray<b3Contact4> hostContacts;
// b3AlignedObjectArray<b3Contact4> hostContacts;
//gpuContacts.copyToHost(hostContacts);
{
bool useGpu = m_data->m_joints.size()==0;
bool useGpu = m_data->m_joints.size() == 0;
// b3Contact4* contacts = numContacts? &hostContacts[0]: 0;
// b3Contact4* contacts = numContacts? &hostContacts[0]: 0;
//m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,contacts,numJoints, joints);
if (useGpu)
{
m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(),&gpuBodies,&gpuInertias,numJoints, m_data->m_gpuConstraints);
} else
m_data->m_gpuSolver->solveJoints(m_data->m_narrowphase->getNumRigidBodies(), &gpuBodies, &gpuInertias, numJoints, m_data->m_gpuConstraints);
}
else
{
b3AlignedObjectArray<b3RigidBodyData> hostBodies;
gpuBodies.copyToHost(hostBodies);
b3AlignedObjectArray<b3InertiaData> hostInertias;
gpuInertias.copyToHost(hostInertias);
b3TypedConstraint** joints = numJoints? &m_data->m_joints[0] : 0;
m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(),&hostBodies[0],&hostInertias[0],0,0,numJoints, joints);
b3TypedConstraint** joints = numJoints ? &m_data->m_joints[0] : 0;
m_data->m_solver->solveContacts(m_data->m_narrowphase->getNumRigidBodies(), &hostBodies[0], &hostInertias[0], 0, 0, numJoints, joints);
gpuBodies.copyFromHost(hostBodies);
}
}
@@ -404,22 +388,20 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
if (numContacts)
{
#ifdef TEST_OTHER_GPU_SOLVER
if (gUseJacobi)
{
bool useGpu = true;
if (useGpu)
{
bool forceHost = false;
if (forceHost)
{
b3AlignedObjectArray<b3RigidBodyData> hostBodies;
b3AlignedObjectArray<b3InertiaData> hostInertias;
b3AlignedObjectArray<b3Contact4> hostContacts;
{
B3_PROFILE("copyToHost");
gpuBodies.copyToHost(hostBodies);
@@ -429,25 +411,24 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
b3JacobiSolverInfo solverInfo;
m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(),&hostContacts[0],hostContacts.size(),solverInfo);
m_data->m_solver3->solveGroupHost(&hostBodies[0], &hostInertias[0], hostBodies.size(), &hostContacts[0], hostContacts.size(), solverInfo);
}
{
B3_PROFILE("copyFromHost");
gpuBodies.copyFromHost(hostBodies);
}
} else
}
else
{
int static0Index = m_data->m_narrowphase->getStatic0Index();
b3JacobiSolverInfo solverInfo;
//m_data->m_solver3->solveContacts( >solveGroup(&gpuBodies, &gpuInertias, &gpuContacts,solverInfo);
//m_data->m_solver3->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(),&hostBodies[0],&hostInertias[0],numContacts,&hostContacts[0]);
m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index);
m_data->m_solver3->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
}
} else
}
else
{
b3AlignedObjectArray<b3RigidBodyData> hostBodies;
gpuBodies.copyToHost(hostBodies);
@@ -460,17 +441,15 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
gpuBodies.copyFromHost(hostBodies);
}
} else
#endif //TEST_OTHER_GPU_SOLVER
}
else
#endif //TEST_OTHER_GPU_SOLVER
{
int static0Index = m_data->m_narrowphase->getStatic0Index();
m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(),gpuInertias.getBufferCL(),numContacts, gpuContacts.getBufferCL(),m_data->m_config, static0Index);
m_data->m_solver2->solveContacts(numBodies, gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL(), m_data->m_config, static0Index);
//m_data->m_solver4->solveContacts(m_data->m_narrowphase->getNumBodiesGpu(), gpuBodies.getBufferCL(), gpuInertias.getBufferCL(), numContacts, gpuContacts.getBufferCL());
/*m_data->m_solver3->solveContactConstraintHost(
(b3OpenCLArray<RigidBodyBase::Body>*)&gpuBodies,
(b3OpenCLArray<RigidBodyBase::Inertia>*)&gpuInertias,
@@ -481,11 +460,9 @@ void b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
}
integrate(deltaTime);
}
void b3GpuRigidBodyPipeline::integrate(float timeStep)
void b3GpuRigidBodyPipeline::integrate(float timeStep)
{
//integrate
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
@@ -493,24 +470,25 @@ void b3GpuRigidBodyPipeline::integrate(float timeStep)
if (gIntegrateOnCpu)
{
if(numBodies)
if (numBodies)
{
b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData();
b3GpuNarrowPhaseInternalData* npData = m_data->m_narrowphase->getInternalData();
npData->m_bodyBufferGPU->copyToHost(*npData->m_bodyBufferCPU);
b3RigidBodyData_t* bodies = &npData->m_bodyBufferCPU->at(0);
for (int nodeID=0;nodeID<numBodies;nodeID++)
for (int nodeID = 0; nodeID < numBodies; nodeID++)
{
integrateSingleTransform( bodies,nodeID, timeStep, angularDamp, m_data->m_gravity);
integrateSingleTransform(bodies, nodeID, timeStep, angularDamp, m_data->m_gravity);
}
npData->m_bodyBufferGPU->copyFromHost(*npData->m_bodyBufferCPU);
}
} else
}
else
{
b3LauncherCL launcher(m_data->m_queue,m_data->m_integrateTransformsKernel,"m_integrateTransformsKernel");
b3LauncherCL launcher(m_data->m_queue, m_data->m_integrateTransformsKernel, "m_integrateTransformsKernel");
launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
launcher.setConst(numBodies);
launcher.setConst(timeStep);
launcher.setConst(angularDamp);
@@ -519,12 +497,9 @@ void b3GpuRigidBodyPipeline::integrate(float timeStep)
}
}
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
{
cl_int ciErrNum=0;
cl_int ciErrNum = 0;
int numBodies = m_data->m_narrowphase->getNumRigidBodies();
if (!numBodies)
@@ -532,34 +507,35 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
if (gCalcWorldSpaceAabbOnCpu)
{
if (numBodies)
{
if (gUseDbvt)
{
m_data->m_allAabbsCPU.resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i=0;i<numBodies;i++)
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_allAabbsCPU[0]);
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_allAabbsCPU[0]);
}
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
} else
}
else
{
m_data->m_broadphaseSap->getAllAabbsCPU().resize(numBodies);
m_data->m_narrowphase->readbackAllBodiesToCpu();
for (int i=0;i<numBodies;i++)
for (int i = 0; i < numBodies; i++)
{
b3ComputeWorldAabb( i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(),&m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
b3ComputeWorldAabb(i, m_data->m_narrowphase->getBodiesCpu(), m_data->m_narrowphase->getCollidablesCpu(), m_data->m_narrowphase->getLocalSpaceAabbsCpu(), &m_data->m_broadphaseSap->getAllAabbsCPU()[0]);
}
m_data->m_broadphaseSap->getAllAabbsGPU().copyFromHost(m_data->m_broadphaseSap->getAllAabbsCPU());
//m_data->m_broadphaseSap->writeAabbsToGpu();
}
}
} else
}
else
{
//__kernel void initializeGpuAabbsFull( const int numNodes, __global Body* gBodies,__global Collidable* collidables, __global b3AABBCL* plocalShapeAABB, __global b3AABBCL* pAABB)
b3LauncherCL launcher(m_data->m_queue,m_data->m_updateAabbsKernel,"m_updateAabbsKernel");
b3LauncherCL launcher(m_data->m_queue, m_data->m_updateAabbsKernel, "m_updateAabbsKernel");
launcher.setConst(numBodies);
cl_mem bodies = m_data->m_narrowphase->getBodiesGpu();
launcher.setBuffer(bodies);
@@ -568,17 +544,18 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
cl_mem localAabbs = m_data->m_narrowphase->getAabbLocalSpaceBufferGpu();
launcher.setBuffer(localAabbs);
cl_mem worldAabbs =0;
cl_mem worldAabbs = 0;
if (gUseDbvt)
{
worldAabbs = m_data->m_allAabbsGPU->getBufferCL();
} else
}
else
{
worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS();
}
launcher.setBuffer(worldAabbs);
launcher.launch1D(numBodies);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
@@ -595,78 +572,68 @@ void b3GpuRigidBodyPipeline::setupGpuAabbsFull()
};
*/
}
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
cl_mem b3GpuRigidBodyPipeline::getBodyBuffer()
{
return m_data->m_narrowphase->getBodiesGpu();
}
int b3GpuRigidBodyPipeline::getNumBodies() const
int b3GpuRigidBodyPipeline::getNumBodies() const
{
return m_data->m_narrowphase->getNumRigidBodies();
}
void b3GpuRigidBodyPipeline::setGravity(const float* grav)
void b3GpuRigidBodyPipeline::setGravity(const float* grav)
{
m_data->m_gravity.setValue(grav[0],grav[1],grav[2]);
m_data->m_gravity.setValue(grav[0], grav[1], grav[2]);
}
void b3GpuRigidBodyPipeline::copyConstraintsToHost()
void b3GpuRigidBodyPipeline::copyConstraintsToHost()
{
m_data->m_gpuConstraints->copyToHost(m_data->m_cpuConstraints);
}
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
void b3GpuRigidBodyPipeline::writeAllInstancesToGpu()
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
m_data->m_gpuConstraints->copyFromHost(m_data->m_cpuConstraints);
}
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex, bool writeInstanceToGpu)
{
b3Vector3 aabbMin=b3MakeVector3(0,0,0),aabbMax=b3MakeVector3(0,0,0);
b3Vector3 aabbMin = b3MakeVector3(0, 0, 0), aabbMax = b3MakeVector3(0, 0, 0);
if (collidableIndex>=0)
if (collidableIndex >= 0)
{
b3SapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
b3Vector3 localAabbMin=b3MakeVector3(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]);
b3Vector3 localAabbMax=b3MakeVector3(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]);
b3Vector3 localAabbMin = b3MakeVector3(localAabb.m_min[0], localAabb.m_min[1], localAabb.m_min[2]);
b3Vector3 localAabbMax = b3MakeVector3(localAabb.m_max[0], localAabb.m_max[1], localAabb.m_max[2]);
b3Scalar margin = 0.01f;
b3Transform t;
t.setIdentity();
t.setOrigin(b3MakeVector3(position[0],position[1],position[2]));
t.setRotation(b3Quaternion(orientation[0],orientation[1],orientation[2],orientation[3]));
b3TransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax);
} else
t.setOrigin(b3MakeVector3(position[0], position[1], position[2]));
t.setRotation(b3Quaternion(orientation[0], orientation[1], orientation[2], orientation[3]));
b3TransformAabb(localAabbMin, localAabbMax, margin, t, aabbMin, aabbMax);
}
else
{
b3Error("registerPhysicsInstance using invalid collidableIndex\n");
return -1;
}
bool writeToGpu = false;
int bodyIndex = m_data->m_narrowphase->getNumRigidBodies();
bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu);
bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex, mass, position, orientation, &aabbMin.getX(), &aabbMax.getX(), writeToGpu);
if (bodyIndex>=0)
if (bodyIndex >= 0)
{
if (gUseDbvt)
{
m_data->m_broadphaseDbvt->createProxy(aabbMin,aabbMax,bodyIndex,0,1,1);
m_data->m_broadphaseDbvt->createProxy(aabbMin, aabbMax, bodyIndex, 0, 1, 1);
b3SapAabb aabb;
for (int i=0;i<3;i++)
for (int i = 0; i < 3; i++)
{
aabb.m_min[i] = aabbMin[i];
aabb.m_max[i] = aabbMax[i];
@@ -677,14 +644,16 @@ int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* po
{
m_data->m_allAabbsGPU->copyFromHost(m_data->m_allAabbsCPU);
}
} else
}
else
{
if (mass)
{
m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher);
} else
m_data->m_broadphaseSap->createProxy(aabbMin, aabbMax, bodyIndex, 1, 1); //m_dispatcher);
}
else
{
m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,bodyIndex,1,1);//m_dispatcher);
m_data->m_broadphaseSap->createLargeProxy(aabbMin, aabbMax, bodyIndex, 1, 1); //m_dispatcher);
}
}
}
@@ -699,10 +668,10 @@ int b3GpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* po
return bodyIndex;
}
void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults)
void b3GpuRigidBodyPipeline::castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults)
{
this->m_data->m_raycaster->castRays(rays,hitResults,
getNumBodies(),this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
this->m_data->m_raycaster->castRays(rays, hitResults,
getNumBodies(), this->m_data->m_narrowphase->getBodiesCpu(),
m_data->m_narrowphase->getNumCollidablesGpu(), m_data->m_narrowphase->getCollidablesCpu(),
m_data->m_narrowphase->getInternalData(), m_data->m_broadphaseSap);
}

View File

@@ -25,50 +25,46 @@ subject to the following restrictions:
class b3GpuRigidBodyPipeline
{
protected:
struct b3GpuRigidBodyPipelineInternalData* m_data;
struct b3GpuRigidBodyPipelineInternalData* m_data;
int allocateCollidable();
public:
b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue q , class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config);
b3GpuRigidBodyPipeline(cl_context ctx, cl_device_id device, cl_command_queue q, class b3GpuNarrowPhase* narrowphase, class b3GpuBroadphaseInterface* broadphaseSap, struct b3DynamicBvhBroadphase* broadphaseDbvt, const b3Config& config);
virtual ~b3GpuRigidBodyPipeline();
void stepSimulation(float deltaTime);
void integrate(float timeStep);
void setupGpuAabbsFull();
void stepSimulation(float deltaTime);
void integrate(float timeStep);
void setupGpuAabbsFull();
int registerConvexPolyhedron(class b3ConvexUtility* convex);
int registerConvexPolyhedron(class b3ConvexUtility* convex);
//int registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
//int registerSphereShape(float radius);
//int registerPlaneShape(const b3Vector3& planeNormal, float planeConstant);
//int registerConcaveMesh(b3AlignedObjectArray<b3Vector3>* vertices, b3AlignedObjectArray<int>* indices, const float* scaling);
//int registerCompoundShape(b3AlignedObjectArray<b3GpuChildShape>* childShapes);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
int registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData, bool writeInstanceToGpu);
//if you passed "writeInstanceToGpu" false in the registerPhysicsInstance method (for performance) you need to call writeAllInstancesToGpu after all instances are registered
void writeAllInstancesToGpu();
void copyConstraintsToHost();
void setGravity(const float* grav);
void writeAllInstancesToGpu();
void copyConstraintsToHost();
void setGravity(const float* grav);
void reset();
int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB,float breakingThreshold);
int createPoint2PointConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, float breakingThreshold);
int createFixedConstraint(int bodyA, int bodyB, const float* pivotInA, const float* pivotInB, const float* relTargetAB, float breakingThreshold);
void removeConstraintByUid(int uid);
void addConstraint(class b3TypedConstraint* constraint);
void removeConstraint(b3TypedConstraint* constraint);
void addConstraint(class b3TypedConstraint* constraint);
void removeConstraint(b3TypedConstraint* constraint);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
void castRays(const b3AlignedObjectArray<b3RayInfo>& rays, b3AlignedObjectArray<b3RayHit>& hitResults);
cl_mem getBodyBuffer();
int getNumBodies() const;
cl_mem getBodyBuffer();
int getNumBodies() const;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_H
#endif //B3_GPU_RIGIDBODY_PIPELINE_H

View File

@@ -22,52 +22,47 @@ subject to the following restrictions:
#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
#include "Bullet3Collision/NarrowPhaseCollision/shared/b3Collidable.h"
#include "Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h"
#include "Bullet3Dynamics/ConstraintSolver/b3TypedConstraint.h"
#include "Bullet3Collision/NarrowPhaseCollision/b3Config.h"
#include "Bullet3Collision/BroadPhaseCollision/b3OverlappingPair.h"
#include "Bullet3OpenCL/RigidBody/b3GpuGenericConstraint.h"
struct b3GpuRigidBodyPipelineInternalData
{
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
cl_kernel m_clearOverlappingPairsKernel;
cl_kernel m_integrateTransformsKernel;
cl_kernel m_updateAabbsKernel;
cl_kernel m_clearOverlappingPairsKernel;
class b3PgsJacobiSolver* m_solver;
class b3GpuPgsConstraintSolver* m_gpuSolver;
class b3GpuPgsContactSolver* m_solver2;
class b3GpuJacobiContactSolver* m_solver3;
class b3GpuRaycast* m_raycaster;
class b3GpuBroadphaseInterface* m_broadphaseSap;
struct b3DynamicBvhBroadphase* m_broadphaseDbvt;
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
b3OpenCLArray<b3SapAabb>* m_allAabbsGPU;
b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
b3OpenCLArray<b3BroadphasePair>* m_overlappingPairsGPU;
b3OpenCLArray<b3GpuGenericConstraint>* m_gpuConstraints;
b3AlignedObjectArray<b3GpuGenericConstraint> m_cpuConstraints;
b3AlignedObjectArray<b3TypedConstraint*> m_joints;
int m_constraintUid;
class b3GpuNarrowPhase* m_narrowphase;
b3Vector3 m_gravity;
int m_constraintUid;
class b3GpuNarrowPhase* m_narrowphase;
b3Vector3 m_gravity;
b3Config m_config;
b3Config m_config;
};
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
#endif //B3_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H

View File

@@ -13,11 +13,9 @@ subject to the following restrictions:
*/
//Originally written by Erwin Coumans
#ifndef B3_GPU_SOLVER_BODY_H
#define B3_GPU_SOLVER_BODY_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Matrix3x3.h"
@@ -27,29 +25,27 @@ subject to the following restrictions:
///Until we get other contributions, only use SIMD on Windows, when using Visual Studio 2008 or later, and not double precision
#ifdef B3_USE_SSE
#define USE_SIMD 1
#endif //
#endif //
///The b3SolverBody is an internal datastructure for the constraint solver. Only necessary data is packed to increase cache coherence/performance.
B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuSolverBody
{
B3_DECLARE_ALIGNED_ALLOCATOR();
// b3Transform m_worldTransformUnused;
b3Vector3 m_deltaLinearVelocity;
b3Vector3 m_deltaAngularVelocity;
b3Vector3 m_angularFactor;
b3Vector3 m_linearFactor;
b3Vector3 m_invMass;
b3Vector3 m_pushVelocity;
b3Vector3 m_turnVelocity;
b3Vector3 m_linearVelocity;
b3Vector3 m_angularVelocity;
// b3Transform m_worldTransformUnused;
b3Vector3 m_deltaLinearVelocity;
b3Vector3 m_deltaAngularVelocity;
b3Vector3 m_angularFactor;
b3Vector3 m_linearFactor;
b3Vector3 m_invMass;
b3Vector3 m_pushVelocity;
b3Vector3 m_turnVelocity;
b3Vector3 m_linearVelocity;
b3Vector3 m_angularVelocity;
union
{
void* m_originalBody;
int m_originalBodyIndex;
union {
void* m_originalBody;
int m_originalBodyIndex;
};
int padding[3];
@@ -65,44 +61,41 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
return m_worldTransform;
}
*/
B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const
B3_FORCE_INLINE void getVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
{
if (m_originalBody)
velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
else
velocity.setValue(0,0,0);
velocity.setValue(0, 0, 0);
}
B3_FORCE_INLINE void getAngularVelocity(b3Vector3& angVel) const
B3_FORCE_INLINE void getAngularVelocity(b3Vector3 & angVel) const
{
if (m_originalBody)
angVel =m_angularVelocity+m_deltaAngularVelocity;
angVel = m_angularVelocity + m_deltaAngularVelocity;
else
angVel.setValue(0,0,0);
angVel.setValue(0, 0, 0);
}
//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude)
B3_FORCE_INLINE void applyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
{
if (m_originalBody)
{
m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
}
}
B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,b3Scalar impulseMagnitude)
B3_FORCE_INLINE void internalApplyPushImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, b3Scalar impulseMagnitude)
{
if (m_originalBody)
{
m_pushVelocity += linearComponent*impulseMagnitude*m_linearFactor;
m_turnVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
m_pushVelocity += linearComponent * impulseMagnitude * m_linearFactor;
m_turnVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
}
}
const b3Vector3& getDeltaLinearVelocity() const
{
return m_deltaLinearVelocity;
@@ -113,20 +106,19 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
return m_deltaAngularVelocity;
}
const b3Vector3& getPushVelocity() const
const b3Vector3& getPushVelocity() const
{
return m_pushVelocity;
}
const b3Vector3& getTurnVelocity() const
const b3Vector3& getTurnVelocity() const
{
return m_turnVelocity;
}
////////////////////////////////////////////////
///some internal methods, don't use them
b3Vector3& internalGetDeltaLinearVelocity()
{
return m_deltaLinearVelocity;
@@ -151,7 +143,7 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
{
m_invMass = invMass;
}
b3Vector3& internalGetPushVelocity()
{
return m_pushVelocity;
@@ -162,67 +154,57 @@ B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverBody
return m_turnVelocity;
}
B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity ) const
B3_FORCE_INLINE void internalGetVelocityInLocalPointObsolete(const b3Vector3& rel_pos, b3Vector3& velocity) const
{
velocity = m_linearVelocity+m_deltaLinearVelocity + (m_angularVelocity+m_deltaAngularVelocity).cross(rel_pos);
velocity = m_linearVelocity + m_deltaLinearVelocity + (m_angularVelocity + m_deltaAngularVelocity).cross(rel_pos);
}
B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3& angVel) const
B3_FORCE_INLINE void internalGetAngularVelocity(b3Vector3 & angVel) const
{
angVel = m_angularVelocity+m_deltaAngularVelocity;
angVel = m_angularVelocity + m_deltaAngularVelocity;
}
//Optimization for the iterative solver: avoid calculating constant terms involving inertia, normal, relative position
B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent,const b3Scalar impulseMagnitude)
B3_FORCE_INLINE void internalApplyImpulse(const b3Vector3& linearComponent, const b3Vector3& angularComponent, const b3Scalar impulseMagnitude)
{
//if (m_originalBody)
{
m_deltaLinearVelocity += linearComponent*impulseMagnitude*m_linearFactor;
m_deltaAngularVelocity += angularComponent*(impulseMagnitude*m_angularFactor);
m_deltaLinearVelocity += linearComponent * impulseMagnitude * m_linearFactor;
m_deltaAngularVelocity += angularComponent * (impulseMagnitude * m_angularFactor);
}
}
void writebackVelocity()
void writebackVelocity()
{
//if (m_originalBody>=0)
{
m_linearVelocity +=m_deltaLinearVelocity;
m_linearVelocity += m_deltaLinearVelocity;
m_angularVelocity += m_deltaAngularVelocity;
//m_originalBody->setCompanionId(-1);
}
}
void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
void writebackVelocityAndTransform(b3Scalar timeStep, b3Scalar splitImpulseTurnErp)
{
(void) timeStep;
(void)timeStep;
if (m_originalBody)
{
m_linearVelocity += m_deltaLinearVelocity;
m_angularVelocity += m_deltaAngularVelocity;
//correct the position/orientation based on push/turn recovery
b3Transform newTransform;
if (m_pushVelocity[0]!=0.f || m_pushVelocity[1]!=0 || m_pushVelocity[2]!=0 || m_turnVelocity[0]!=0.f || m_turnVelocity[1]!=0 || m_turnVelocity[2]!=0)
if (m_pushVelocity[0] != 0.f || m_pushVelocity[1] != 0 || m_pushVelocity[2] != 0 || m_turnVelocity[0] != 0.f || m_turnVelocity[1] != 0 || m_turnVelocity[2] != 0)
{
// b3Quaternion orn = m_worldTransform.getRotation();
// b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
// m_worldTransform = newTransform;
// b3Quaternion orn = m_worldTransform.getRotation();
// b3TransformUtil::integrateTransform(m_worldTransform,m_pushVelocity,m_turnVelocity*splitImpulseTurnErp,timeStep,newTransform);
// m_worldTransform = newTransform;
}
//m_worldTransform.setRotation(orn);
//m_originalBody->setCompanionId(-1);
}
}
};
#endif //B3_SOLVER_BODY_H
#endif //B3_SOLVER_BODY_H

View File

@@ -13,11 +13,9 @@ subject to the following restrictions:
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef B3_GPU_SOLVER_CONSTRAINT_H
#define B3_GPU_SOLVER_CONSTRAINT_H
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3Common/b3Matrix3x3.h"
//#include "b3JacobianEntry.h"
@@ -25,58 +23,51 @@ subject to the following restrictions:
//#define NO_FRICTION_TANGENTIALS 1
///1D constraint along a normal axis between bodyA and bodyB. It can be combined to solve contact and friction constraints.
B3_ATTRIBUTE_ALIGNED16 (struct) b3GpuSolverConstraint
B3_ATTRIBUTE_ALIGNED16(struct)
b3GpuSolverConstraint
{
B3_DECLARE_ALIGNED_ALLOCATOR();
b3Vector3 m_relpos1CrossNormal;
b3Vector3 m_contactNormal;
b3Vector3 m_relpos1CrossNormal;
b3Vector3 m_contactNormal;
b3Vector3 m_relpos2CrossNormal;
b3Vector3 m_relpos2CrossNormal;
//b3Vector3 m_contactNormal2;//usually m_contactNormal2 == -m_contactNormal
b3Vector3 m_angularComponentA;
b3Vector3 m_angularComponentB;
mutable b3Scalar m_appliedPushImpulse;
mutable b3Scalar m_appliedImpulse;
b3Vector3 m_angularComponentA;
b3Vector3 m_angularComponentB;
mutable b3Scalar m_appliedPushImpulse;
mutable b3Scalar m_appliedImpulse;
int m_padding1;
int m_padding2;
b3Scalar m_friction;
b3Scalar m_jacDiagABInv;
b3Scalar m_rhs;
b3Scalar m_cfm;
b3Scalar m_lowerLimit;
b3Scalar m_upperLimit;
b3Scalar m_rhsPenetration;
union
{
void* m_originalContactPoint;
int m_originalConstraintIndex;
b3Scalar m_unusedPadding4;
b3Scalar m_friction;
b3Scalar m_jacDiagABInv;
b3Scalar m_rhs;
b3Scalar m_cfm;
b3Scalar m_lowerLimit;
b3Scalar m_upperLimit;
b3Scalar m_rhsPenetration;
union {
void* m_originalContactPoint;
int m_originalConstraintIndex;
b3Scalar m_unusedPadding4;
};
int m_overrideNumSolverIterations;
int m_frictionIndex;
int m_overrideNumSolverIterations;
int m_frictionIndex;
int m_solverBodyIdA;
int m_solverBodyIdB;
enum b3SolverConstraintType
enum b3SolverConstraintType
{
B3_SOLVER_CONTACT_1D = 0,
B3_SOLVER_FRICTION_1D
};
};
typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray;
#endif //B3_GPU_SOLVER_CONSTRAINT_H
typedef b3AlignedObjectArray<b3GpuSolverConstraint> b3GpuConstraintArray;
#endif //B3_GPU_SOLVER_CONSTRAINT_H

File diff suppressed because it is too large Load Diff

View File

@@ -13,7 +13,6 @@ subject to the following restrictions:
*/
//Originally written by Takahiro Harada
#ifndef __ADL_SOLVER_H
#define __ADL_SOLVER_H
@@ -29,98 +28,83 @@ subject to the following restrictions:
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#define B3NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
#define B3NEXTMULTIPLEOF(num, alignment) (((num) / (alignment) + (((num) % (alignment) == 0) ? 0 : 1)) * (alignment))
enum
{
B3_SOLVER_N_SPLIT_X = 8,//16,//4,
B3_SOLVER_N_SPLIT_Y = 4,//16,//4,
B3_SOLVER_N_SPLIT_Z = 8,//,
B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X*B3_SOLVER_N_SPLIT_Y*B3_SOLVER_N_SPLIT_Z,
B3_SOLVER_N_BATCHES = 8,//4,//8,//4,
B3_SOLVER_N_SPLIT_X = 8, //16,//4,
B3_SOLVER_N_SPLIT_Y = 4, //16,//4,
B3_SOLVER_N_SPLIT_Z = 8, //,
B3_SOLVER_N_CELLS = B3_SOLVER_N_SPLIT_X * B3_SOLVER_N_SPLIT_Y * B3_SOLVER_N_SPLIT_Z,
B3_SOLVER_N_BATCHES = 8, //4,//8,//4,
B3_MAX_NUM_BATCHES = 128,
};
class b3SolverBase
{
public:
public:
struct ConstraintCfg
{
ConstraintCfg(float dt = 0.f) : m_positionDrift(0.005f), m_positionConstraintCoeff(0.2f), m_dt(dt), m_staticIdx(-1) {}
struct ConstraintCfg
{
ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt), m_staticIdx(-1) {}
float m_positionDrift;
float m_positionConstraintCoeff;
float m_dt;
bool m_enableParallelSolve;
float m_batchCellSize;
int m_staticIdx;
};
float m_positionDrift;
float m_positionConstraintCoeff;
float m_dt;
bool m_enableParallelSolve;
float m_batchCellSize;
int m_staticIdx;
};
};
class b3Solver : public b3SolverBase
{
public:
public:
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
cl_context m_context;
cl_device_id m_device;
cl_command_queue m_queue;
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
b3OpenCLArray<int> m_batchSizes;
b3OpenCLArray<unsigned int>* m_numConstraints;
b3OpenCLArray<unsigned int>* m_offsets;
b3OpenCLArray<int> m_batchSizes;
int m_nIterations;
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
cl_kernel m_contactToConstraintKernel;
cl_kernel m_setSortDataKernel;
cl_kernel m_reorderContactKernel;
cl_kernel m_copyConstraintKernel;
int m_nIterations;
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
cl_kernel m_contactToConstraintKernel;
cl_kernel m_setSortDataKernel;
cl_kernel m_reorderContactKernel;
cl_kernel m_copyConstraintKernel;
class b3RadixSort32CL* m_sort32;
class b3BoundSearchCL* m_search;
class b3PrefixScanCL* m_scan;
class b3RadixSort32CL* m_sort32;
class b3BoundSearchCL* m_search;
class b3PrefixScanCL* m_scan;
b3OpenCLArray<b3SortData>* m_sortDataBuffer;
b3OpenCLArray<b3Contact4>* m_contactBuffer2;
b3OpenCLArray<b3SortData>* m_sortDataBuffer;
b3OpenCLArray<b3Contact4>* m_contactBuffer2;
enum
{
DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
};
enum
{
DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
};
b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity);
virtual ~b3Solver();
virtual ~b3Solver();
void solveContactConstraint( const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches);
void solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* inertiaBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches);
void solveContactConstraintHost( b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, b3AlignedObjectArray<int>* batchSizes);
void solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes);
void convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
int nContacts, const ConstraintCfg& cfg);
void convertToConstraints( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
const b3OpenCLArray<b3InertiaData>* shapeBuf,
b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
int nContacts, const ConstraintCfg& cfg );
void batchContacts( b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx );
void batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* n, b3OpenCLArray<unsigned int>* offsets, int staticIdx);
};
#endif //__ADL_SOLVER_H
#endif //__ADL_SOLVER_H

View File

@@ -1,388 +1,387 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* batchingKernelsCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#ifndef B3_CONTACT4DATA_H\n"
"#define B3_CONTACT4DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"typedef struct b3Contact4Data b3Contact4Data_t;\n"
"struct b3Contact4Data\n"
"{\n"
" b3Float4 m_worldPosB[4];\n"
"// b3Float4 m_localPosA[4];\n"
"// b3Float4 m_localPosB[4];\n"
" b3Float4 m_worldNormalOnB; // w: m_nPoints\n"
" unsigned short m_restituitionCoeffCmp;\n"
" unsigned short m_frictionCoeffCmp;\n"
" int m_batchIdx;\n"
" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
" int m_bodyBPtrAndSignBit;\n"
" int m_childIndexA;\n"
" int m_childIndexB;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"};\n"
"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
"{\n"
" return (int)contact->m_worldNormalOnB.w;\n"
"};\n"
"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
"{\n"
" contact->m_worldNormalOnB.w = (float)numPoints;\n"
"};\n"
"#endif //B3_CONTACT4DATA_H\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"#define WG_SIZE 64\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"#define STACK_SIZE (WG_SIZE*10)\n"
"//#define STACK_SIZE (WG_SIZE)\n"
"#define RING_SIZE 1024\n"
"#define RING_SIZE_MASK (RING_SIZE-1)\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
"#define RING_END ldsTmp\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
"// buff[bufIdx] |= (1<<bitIdx);\n"
" atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
" __local u32 ldsStackEnd;\n"
" __local Elem ldsRingElem[RING_SIZE];\n"
" __local u32 ldsRingEnd;\n"
" __local u32 ldsTmp;\n"
" __local u32 ldsCheckBuffer[CHECK_SIZE];\n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" __local u32 ldsGEnd;\n"
" __local u32 ldsDstEnd;\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsRingEnd = 0;\n"
" ldsGEnd = 0;\n"
" ldsStackEnd = 0;\n"
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
" \n"
" \n"
"// while(1)\n"
"//was 250\n"
" int ie=0;\n"
" int maxBatch = 0;\n"
" for(ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
" for(int giter=0; giter<4; giter++)\n"
" {\n"
" int ringCap = GET_RING_CAPACITY;\n"
" \n"
" // 1. fill ring\n"
" if( ldsGEnd < m_n )\n"
" {\n"
" while( ringCap > WG_SIZE )\n"
" {\n"
" if( ldsGEnd >= m_n ) break;\n"
" if( lIdx < ringCap - WG_SIZE )\n"
" {\n"
" int srcIdx;\n"
" AtomInc1( ldsGEnd, srcIdx );\n"
" if( srcIdx < m_n )\n"
" {\n"
" int dstIdx;\n"
" AtomInc1( ldsRingEnd, dstIdx );\n"
" \n"
" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
" ldsRingElem[dstIdx].m_idx = srcIdx;\n"
" }\n"
" }\n"
" ringCap = GET_RING_CAPACITY;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" // 2. fill stack\n"
" __local Elem* dst = ldsRingElem;\n"
" if( lIdx == 0 ) RING_END = 0;\n"
" int srcIdx=lIdx;\n"
" int end = ldsRingEnd;\n"
" {\n"
" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
" {\n"
" Elem e;\n"
" if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
" bool done = (srcIdx<end)?false:true;\n"
" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
" \n"
" if( !done )\n"
" {\n"
" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
" if( aUsed==0 && bUsed==0 )\n"
" {\n"
" int aAvailable=1;\n"
" int bAvailable=1;\n"
" int ea = abs(e.m_a);\n"
" int eb = abs(e.m_b);\n"
" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
" \n"
" if (!aStatic)\n"
" aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" if (!bStatic)\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
" \n"
" //aAvailable = aStatic? 1: aAvailable;\n"
" //bAvailable = bStatic? 1: bAvailable;\n"
" bool success = (aAvailable && bAvailable);\n"
" if(success)\n"
" {\n"
" \n"
" if (!aStatic)\n"
" writeBuf( ldsFixedBuffer, ea );\n"
" if (!bStatic)\n"
" writeBuf( ldsFixedBuffer, eb );\n"
" }\n"
" done = success;\n"
" }\n"
" }\n"
" // put it aside\n"
" if(srcIdx<end)\n"
" {\n"
" if( done )\n"
" {\n"
" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
" if( dstIdx < STACK_SIZE )\n"
" ldsStackIdx[dstIdx] = e.m_idx;\n"
" else{\n"
" done = false;\n"
" AtomAdd( ldsStackEnd, -1 );\n"
" }\n"
" }\n"
" if( !done )\n"
" {\n"
" int dstIdx; AtomInc1( RING_END, dstIdx );\n"
" dst[dstIdx] = e;\n"
" }\n"
" }\n"
" // if filled, flush\n"
" if( ldsStackEnd == STACK_SIZE )\n"
" {\n"
" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
" ldsFixedBuffer[lIdx] = 0;\n"
" }\n"
" }\n"
" }\n"
" if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" // in case it couldn't consume any pair. Flush them\n"
" // todo. Serial batch worth while?\n"
" if( ldsStackEnd == 0 )\n"
" {\n"
" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" int curBatch = 100+i;\n"
" if (maxBatch < curBatch)\n"
" maxBatch = curBatch;\n"
" \n"
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
" \n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" GROUP_LDS_BARRIER;\n"
" // termination\n"
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
" if( lIdx == 0 )\n"
" {\n"
" if (maxBatch < ie)\n"
" maxBatch=ie;\n"
" batchSizes[wgIdx]=maxBatch;\n"
" }\n"
"}\n"
;
static const char* batchingKernelsCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"#ifndef B3_CONTACT4DATA_H\n"
"#define B3_CONTACT4DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"typedef struct b3Contact4Data b3Contact4Data_t;\n"
"struct b3Contact4Data\n"
"{\n"
" b3Float4 m_worldPosB[4];\n"
"// b3Float4 m_localPosA[4];\n"
"// b3Float4 m_localPosB[4];\n"
" b3Float4 m_worldNormalOnB; // w: m_nPoints\n"
" unsigned short m_restituitionCoeffCmp;\n"
" unsigned short m_frictionCoeffCmp;\n"
" int m_batchIdx;\n"
" int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
" int m_bodyBPtrAndSignBit;\n"
" int m_childIndexA;\n"
" int m_childIndexB;\n"
" int m_unused1;\n"
" int m_unused2;\n"
"};\n"
"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
"{\n"
" return (int)contact->m_worldNormalOnB.w;\n"
"};\n"
"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
"{\n"
" contact->m_worldNormalOnB.w = (float)numPoints;\n"
"};\n"
"#endif //B3_CONTACT4DATA_H\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"#define WG_SIZE 64\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"#define STACK_SIZE (WG_SIZE*10)\n"
"//#define STACK_SIZE (WG_SIZE)\n"
"#define RING_SIZE 1024\n"
"#define RING_SIZE_MASK (RING_SIZE-1)\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
"#define RING_END ldsTmp\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
"// buff[bufIdx] |= (1<<bitIdx);\n"
" atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"// batching on the GPU\n"
"__kernel void CreateBatches( __global const struct b3Contact4Data* gConstraints, __global struct b3Contact4Data* gConstraintsOut,\n"
" __global const u32* gN, __global const u32* gStart, __global int* batchSizes, \n"
" int m_staticIdx )\n"
"{\n"
" __local u32 ldsStackIdx[STACK_SIZE];\n"
" __local u32 ldsStackEnd;\n"
" __local Elem ldsRingElem[RING_SIZE];\n"
" __local u32 ldsRingEnd;\n"
" __local u32 ldsTmp;\n"
" __local u32 ldsCheckBuffer[CHECK_SIZE];\n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" __local u32 ldsGEnd;\n"
" __local u32 ldsDstEnd;\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsRingEnd = 0;\n"
" ldsGEnd = 0;\n"
" ldsStackEnd = 0;\n"
" ldsDstEnd = m_start;\n"
" }\n"
" \n"
" \n"
" \n"
"// while(1)\n"
"//was 250\n"
" int ie=0;\n"
" int maxBatch = 0;\n"
" for(ie=0; ie<50; ie++)\n"
" {\n"
" ldsFixedBuffer[lIdx] = 0;\n"
" for(int giter=0; giter<4; giter++)\n"
" {\n"
" int ringCap = GET_RING_CAPACITY;\n"
" \n"
" // 1. fill ring\n"
" if( ldsGEnd < m_n )\n"
" {\n"
" while( ringCap > WG_SIZE )\n"
" {\n"
" if( ldsGEnd >= m_n ) break;\n"
" if( lIdx < ringCap - WG_SIZE )\n"
" {\n"
" int srcIdx;\n"
" AtomInc1( ldsGEnd, srcIdx );\n"
" if( srcIdx < m_n )\n"
" {\n"
" int dstIdx;\n"
" AtomInc1( ldsRingEnd, dstIdx );\n"
" \n"
" int a = gConstraints[m_start+srcIdx].m_bodyAPtrAndSignBit;\n"
" int b = gConstraints[m_start+srcIdx].m_bodyBPtrAndSignBit;\n"
" ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
" ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
" ldsRingElem[dstIdx].m_idx = srcIdx;\n"
" }\n"
" }\n"
" ringCap = GET_RING_CAPACITY;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" // 2. fill stack\n"
" __local Elem* dst = ldsRingElem;\n"
" if( lIdx == 0 ) RING_END = 0;\n"
" int srcIdx=lIdx;\n"
" int end = ldsRingEnd;\n"
" {\n"
" for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
" {\n"
" Elem e;\n"
" if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
" bool done = (srcIdx<end)?false:true;\n"
" for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
" \n"
" if( !done )\n"
" {\n"
" int aUsed = readBuf( ldsFixedBuffer, abs(e.m_a));\n"
" int bUsed = readBuf( ldsFixedBuffer, abs(e.m_b));\n"
" if( aUsed==0 && bUsed==0 )\n"
" {\n"
" int aAvailable=1;\n"
" int bAvailable=1;\n"
" int ea = abs(e.m_a);\n"
" int eb = abs(e.m_b);\n"
" bool aStatic = (e.m_a<0) ||(ea==m_staticIdx);\n"
" bool bStatic = (e.m_b<0) ||(eb==m_staticIdx);\n"
" \n"
" if (!aStatic)\n"
" aAvailable = tryWrite( ldsCheckBuffer, ea );\n"
" if (!bStatic)\n"
" bAvailable = tryWrite( ldsCheckBuffer, eb );\n"
" \n"
" //aAvailable = aStatic? 1: aAvailable;\n"
" //bAvailable = bStatic? 1: bAvailable;\n"
" bool success = (aAvailable && bAvailable);\n"
" if(success)\n"
" {\n"
" \n"
" if (!aStatic)\n"
" writeBuf( ldsFixedBuffer, ea );\n"
" if (!bStatic)\n"
" writeBuf( ldsFixedBuffer, eb );\n"
" }\n"
" done = success;\n"
" }\n"
" }\n"
" // put it aside\n"
" if(srcIdx<end)\n"
" {\n"
" if( done )\n"
" {\n"
" int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
" if( dstIdx < STACK_SIZE )\n"
" ldsStackIdx[dstIdx] = e.m_idx;\n"
" else{\n"
" done = false;\n"
" AtomAdd( ldsStackEnd, -1 );\n"
" }\n"
" }\n"
" if( !done )\n"
" {\n"
" int dstIdx; AtomInc1( RING_END, dstIdx );\n"
" dst[dstIdx] = e;\n"
" }\n"
" }\n"
" // if filled, flush\n"
" if( ldsStackEnd == STACK_SIZE )\n"
" {\n"
" for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" //for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
" ldsFixedBuffer[lIdx] = 0;\n"
" }\n"
" }\n"
" }\n"
" if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsStackIdx[i];\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
" }\n"
" // in case it couldn't consume any pair. Flush them\n"
" // todo. Serial batch worth while?\n"
" if( ldsStackEnd == 0 )\n"
" {\n"
" for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
" {\n"
" int idx = m_start + ldsRingElem[i].m_idx;\n"
" int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
" gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
" int curBatch = 100+i;\n"
" if (maxBatch < curBatch)\n"
" maxBatch = curBatch;\n"
" \n"
" gConstraintsOut[ dstIdx ].m_batchIdx = curBatch;\n"
" \n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 ) ldsRingEnd = 0;\n"
" }\n"
" if( lIdx == 0 ) ldsStackEnd = 0;\n"
" GROUP_LDS_BARRIER;\n"
" // termination\n"
" if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
" break;\n"
" }\n"
" if( lIdx == 0 )\n"
" {\n"
" if (maxBatch < ie)\n"
" maxBatch=ie;\n"
" batchSizes[wgIdx]=maxBatch;\n"
" }\n"
"}\n";

View File

@@ -1,291 +1,290 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
// Stringified OpenCL program with two batching kernels:
// - CreateBatchesBruteForce: work-item 0 assigns a distinct m_batchIdx (= i)
//   to every contact of the workgroup (serial fallback, no body sharing check).
// - CreateBatchesNew: work-item 0 greedily builds batches, using the
//   ldsFixedBuffer bit-set so two contacts in one batch never share a
//   non-static body; accepted contacts are swapped to the front and the
//   resulting batch count is stored in batchSizes[wgIdx].
// NOTE(review): autogenerated content - regenerate via stringify, do not hand-edit.
static const char* batchingKernelsNewCL= \
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#ifndef B3_CONTACT4DATA_H\n"
	"#define B3_CONTACT4DATA_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#define B3_FLOAT4_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#define B3_PLATFORM_DEFINITIONS_H\n"
	"struct MyTest\n"
	"{\n"
	"	int bla;\n"
	"};\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
	"#define B3_LARGE_FLOAT 1e18f\n"
	"#define B3_INFINITY 1e18f\n"
	"#define b3Assert(a)\n"
	"#define b3ConstArray(a) __global const a*\n"
	"#define b3AtomicInc atomic_inc\n"
	"#define b3AtomicAdd atomic_add\n"
	"#define b3Fabs fabs\n"
	"#define b3Sqrt native_sqrt\n"
	"#define b3Sin native_sin\n"
	"#define b3Cos native_cos\n"
	"#define B3_STATIC\n"
	"#endif\n"
	"#endif\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4 b3Float4;\n"
	"	#define b3Float4ConstArg const b3Float4\n"
	"	#define b3MakeFloat4 (float4)\n"
	"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return dot(a1, b1);\n"
	"	}\n"
	"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return cross(a1, b1);\n"
	"	}\n"
	"	#define b3MinFloat4 min\n"
	"	#define b3MaxFloat4 max\n"
	"	#define b3Normalized(a) normalize(a)\n"
	"#endif \n"
	"	\n"
	"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
	"{\n"
	"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
	"		return false;\n"
	"	return true;\n"
	"}\n"
	"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
	"{\n"
	"    float maxDot = -B3_INFINITY;\n"
	"    int i = 0;\n"
	"    int ptIndex = -1;\n"
	"    for( i = 0; i < vecLen; i++ )\n"
	"    {\n"
	"        float dot = b3Dot3F4(vecArray[i],vec);\n"
	"        \n"
	"        if( dot > maxDot )\n"
	"        {\n"
	"            maxDot = dot;\n"
	"            ptIndex = i;\n"
	"        }\n"
	"    }\n"
	"	b3Assert(ptIndex>=0);\n"
	"    if (ptIndex<0)\n"
	"	{\n"
	"		ptIndex = 0;\n"
	"	}\n"
	"    *dotOut = maxDot;\n"
	"    return ptIndex;\n"
	"}\n"
	"#endif //B3_FLOAT4_H\n"
	"typedef struct b3Contact4Data b3Contact4Data_t;\n"
	"struct b3Contact4Data\n"
	"{\n"
	"	b3Float4		m_worldPosB[4];\n"
	"//	b3Float4		m_localPosA[4];\n"
	"//	b3Float4		m_localPosB[4];\n"
	"	b3Float4		m_worldNormalOnB;	//	w: m_nPoints\n"
	"	unsigned short  m_restituitionCoeffCmp;\n"
	"	unsigned short  m_frictionCoeffCmp;\n"
	"	int m_batchIdx;\n"
	"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
	"	int m_bodyBPtrAndSignBit;\n"
	"	int	m_childIndexA;\n"
	"	int	m_childIndexB;\n"
	"	int m_unused1;\n"
	"	int m_unused2;\n"
	"};\n"
	"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
	"{\n"
	"	return (int)contact->m_worldNormalOnB.w;\n"
	"};\n"
	"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
	"{\n"
	"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
	"};\n"
	"#endif //B3_CONTACT4DATA_H\n"
	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
	"#ifdef cl_ext_atomic_counters_32\n"
	"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
	"#else\n"
	"#define counter32_t volatile __global int*\n"
	"#endif\n"
	"#define SIMD_WIDTH 64\n"
	"typedef unsigned int u32;\n"
	"typedef unsigned short u16;\n"
	"typedef unsigned char u8;\n"
	"#define GET_GROUP_IDX get_group_id(0)\n"
	"#define GET_LOCAL_IDX get_local_id(0)\n"
	"#define GET_GLOBAL_IDX get_global_id(0)\n"
	"#define GET_GROUP_SIZE get_local_size(0)\n"
	"#define GET_NUM_GROUPS get_num_groups(0)\n"
	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
	"#define AtomInc(x) atom_inc(&(x))\n"
	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
	"#define AppendInc(x, out) out = atomic_inc(x)\n"
	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
	"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
	"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
	"#define make_float4 (float4)\n"
	"#define make_float2 (float2)\n"
	"#define make_uint4 (uint4)\n"
	"#define make_int4 (int4)\n"
	"#define make_uint2 (uint2)\n"
	"#define make_int2 (int2)\n"
	"#define max2 max\n"
	"#define min2 min\n"
	"#define WG_SIZE 64\n"
	"typedef struct \n"
	"{\n"
	"	int m_n;\n"
	"	int m_start;\n"
	"	int m_staticIdx;\n"
	"	int m_paddings[1];\n"
	"} ConstBuffer;\n"
	"typedef struct \n"
	"{\n"
	"	int m_a;\n"
	"	int m_b;\n"
	"	u32 m_idx;\n"
	"}Elem;\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	\n"
	"	const int m_n = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"		\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		for (int i=0;i<m_n;i++)\n"
	"		{\n"
	"			int srcIdx = i+m_start;\n"
	"			int batchIndex = i;\n"
	"			gConstraints[ srcIdx ].m_batchIdx = batchIndex;	\n"
	"		}\n"
	"	}\n"
	"}\n"
	"#define CHECK_SIZE (WG_SIZE)\n"
	"u32 readBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	return buff[bufIdx] & (1<<bitIdx);\n"
	"}\n"
	"void writeBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	buff[bufIdx] |= (1<<bitIdx);\n"
	"	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"}\n"
	"u32 tryWrite(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"	return ((ans >> bitIdx)&1) == 0;\n"
	"}\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	const int numConstraints = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"	b3Contact4Data_t tmp;\n"
	"	\n"
	"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		\n"
	"		\n"
	"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
	"		\n"
	"		\n"
	"		int numValidConstraints = 0;\n"
	"		int batchIdx = 0;\n"
	"		while( numValidConstraints < numConstraints)\n"
	"		{\n"
	"			int nCurrentBatch = 0;\n"
	"			//	clear flag\n"
	"	\n"
	"			for(int i=0; i<CHECK_SIZE; i++) \n"
	"				ldsFixedBuffer[i] = 0;		\n"
	"			for(int i=numValidConstraints; i<numConstraints; i++)\n"
	"			{\n"
	"				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
	"				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
	"				int bodyA = abs(bodyAS);\n"
	"				int bodyB = abs(bodyBS);\n"
	"				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
	"				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
	"				int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
	"				int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
	"				\n"
	"				if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
	"				{\n"
	"					if (!aIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyA );\n"
	"					}\n"
	"					if (!bIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyB );\n"
	"					}\n"
	"					cs[i].m_batchIdx = batchIdx;\n"
	"					if (i!=numValidConstraints)\n"
	"					{\n"
	"						tmp = cs[i];\n"
	"						cs[i] = cs[numValidConstraints];\n"
	"						cs[numValidConstraints]  = tmp;\n"
	"					}\n"
	"					numValidConstraints++;\n"
	"					\n"
	"					nCurrentBatch++;\n"
	"					if( nCurrentBatch == SIMD_WIDTH)\n"
	"					{\n"
	"						nCurrentBatch = 0;\n"
	"						for(int i=0; i<CHECK_SIZE; i++) \n"
	"							ldsFixedBuffer[i] = 0;\n"
	"						\n"
	"					}\n"
	"				}\n"
	"			}//for\n"
	"			batchIdx ++;\n"
	"		}//while\n"
	"		\n"
	"		batchSizes[wgIdx] = batchIdx;\n"
	"	}//if( lIdx == 0 )\n"
	"	\n"
	"	//return batchIdx;\n"
	"}\n"
;
// Stringified OpenCL program with two batching kernels:
// - CreateBatchesBruteForce: work-item 0 assigns a distinct m_batchIdx (= i)
//   to every contact of the workgroup (serial fallback, no body sharing check).
// - CreateBatchesNew: work-item 0 greedily builds batches, using the
//   ldsFixedBuffer bit-set so two contacts in one batch never share a
//   non-static body; accepted contacts are swapped to the front and the
//   resulting batch count is stored in batchSizes[wgIdx].
// NOTE(review): autogenerated content - regenerate via stringify, do not hand-edit.
static const char* batchingKernelsNewCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Erwin Coumans\n"
	"#ifndef B3_CONTACT4DATA_H\n"
	"#define B3_CONTACT4DATA_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#define B3_FLOAT4_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#define B3_PLATFORM_DEFINITIONS_H\n"
	"struct MyTest\n"
	"{\n"
	"	int bla;\n"
	"};\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
	"#define B3_LARGE_FLOAT 1e18f\n"
	"#define B3_INFINITY 1e18f\n"
	"#define b3Assert(a)\n"
	"#define b3ConstArray(a) __global const a*\n"
	"#define b3AtomicInc atomic_inc\n"
	"#define b3AtomicAdd atomic_add\n"
	"#define b3Fabs fabs\n"
	"#define b3Sqrt native_sqrt\n"
	"#define b3Sin native_sin\n"
	"#define b3Cos native_cos\n"
	"#define B3_STATIC\n"
	"#endif\n"
	"#endif\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4 b3Float4;\n"
	"	#define b3Float4ConstArg const b3Float4\n"
	"	#define b3MakeFloat4 (float4)\n"
	"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return dot(a1, b1);\n"
	"	}\n"
	"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return cross(a1, b1);\n"
	"	}\n"
	"	#define b3MinFloat4 min\n"
	"	#define b3MaxFloat4 max\n"
	"	#define b3Normalized(a) normalize(a)\n"
	"#endif \n"
	"	\n"
	"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
	"{\n"
	"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
	"		return false;\n"
	"	return true;\n"
	"}\n"
	"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
	"{\n"
	"    float maxDot = -B3_INFINITY;\n"
	"    int i = 0;\n"
	"    int ptIndex = -1;\n"
	"    for( i = 0; i < vecLen; i++ )\n"
	"    {\n"
	"        float dot = b3Dot3F4(vecArray[i],vec);\n"
	"        \n"
	"        if( dot > maxDot )\n"
	"        {\n"
	"            maxDot = dot;\n"
	"            ptIndex = i;\n"
	"        }\n"
	"    }\n"
	"	b3Assert(ptIndex>=0);\n"
	"    if (ptIndex<0)\n"
	"	{\n"
	"		ptIndex = 0;\n"
	"	}\n"
	"    *dotOut = maxDot;\n"
	"    return ptIndex;\n"
	"}\n"
	"#endif //B3_FLOAT4_H\n"
	"typedef struct b3Contact4Data b3Contact4Data_t;\n"
	"struct b3Contact4Data\n"
	"{\n"
	"	b3Float4		m_worldPosB[4];\n"
	"//	b3Float4		m_localPosA[4];\n"
	"//	b3Float4		m_localPosB[4];\n"
	"	b3Float4		m_worldNormalOnB;	//	w: m_nPoints\n"
	"	unsigned short  m_restituitionCoeffCmp;\n"
	"	unsigned short  m_frictionCoeffCmp;\n"
	"	int m_batchIdx;\n"
	"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
	"	int m_bodyBPtrAndSignBit;\n"
	"	int	m_childIndexA;\n"
	"	int	m_childIndexB;\n"
	"	int m_unused1;\n"
	"	int m_unused2;\n"
	"};\n"
	"inline int b3Contact4Data_getNumPoints(const struct b3Contact4Data* contact)\n"
	"{\n"
	"	return (int)contact->m_worldNormalOnB.w;\n"
	"};\n"
	"inline void b3Contact4Data_setNumPoints(struct b3Contact4Data* contact, int numPoints)\n"
	"{\n"
	"	contact->m_worldNormalOnB.w = (float)numPoints;\n"
	"};\n"
	"#endif //B3_CONTACT4DATA_H\n"
	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
	"#ifdef cl_ext_atomic_counters_32\n"
	"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
	"#else\n"
	"#define counter32_t volatile __global int*\n"
	"#endif\n"
	"#define SIMD_WIDTH 64\n"
	"typedef unsigned int u32;\n"
	"typedef unsigned short u16;\n"
	"typedef unsigned char u8;\n"
	"#define GET_GROUP_IDX get_group_id(0)\n"
	"#define GET_LOCAL_IDX get_local_id(0)\n"
	"#define GET_GLOBAL_IDX get_global_id(0)\n"
	"#define GET_GROUP_SIZE get_local_size(0)\n"
	"#define GET_NUM_GROUPS get_num_groups(0)\n"
	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
	"#define AtomInc(x) atom_inc(&(x))\n"
	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
	"#define AppendInc(x, out) out = atomic_inc(x)\n"
	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
	"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
	"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
	"#define make_float4 (float4)\n"
	"#define make_float2 (float2)\n"
	"#define make_uint4 (uint4)\n"
	"#define make_int4 (int4)\n"
	"#define make_uint2 (uint2)\n"
	"#define make_int2 (int2)\n"
	"#define max2 max\n"
	"#define min2 min\n"
	"#define WG_SIZE 64\n"
	"typedef struct \n"
	"{\n"
	"	int m_n;\n"
	"	int m_start;\n"
	"	int m_staticIdx;\n"
	"	int m_paddings[1];\n"
	"} ConstBuffer;\n"
	"typedef struct \n"
	"{\n"
	"	int m_a;\n"
	"	int m_b;\n"
	"	u32 m_idx;\n"
	"}Elem;\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesBruteForce( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	\n"
	"	const int m_n = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"		\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		for (int i=0;i<m_n;i++)\n"
	"		{\n"
	"			int srcIdx = i+m_start;\n"
	"			int batchIndex = i;\n"
	"			gConstraints[ srcIdx ].m_batchIdx = batchIndex;	\n"
	"		}\n"
	"	}\n"
	"}\n"
	"#define CHECK_SIZE (WG_SIZE)\n"
	"u32 readBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	return buff[bufIdx] & (1<<bitIdx);\n"
	"}\n"
	"void writeBuf(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	buff[bufIdx] |= (1<<bitIdx);\n"
	"	//atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"}\n"
	"u32 tryWrite(__local u32* buff, int idx)\n"
	"{\n"
	"	idx = idx % (32*CHECK_SIZE);\n"
	"	int bitIdx = idx%32;\n"
	"	int bufIdx = idx/32;\n"
	"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
	"	return ((ans >> bitIdx)&1) == 0;\n"
	"}\n"
	"//	batching on the GPU\n"
	"__kernel void CreateBatchesNew( __global struct b3Contact4Data* gConstraints, __global const u32* gN, __global const u32* gStart, __global int* batchSizes, int staticIdx )\n"
	"{\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	const int numConstraints = gN[wgIdx];\n"
	"	const int m_start = gStart[wgIdx];\n"
	"	b3Contact4Data_t tmp;\n"
	"	\n"
	"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		\n"
	"		\n"
	"		__global struct b3Contact4Data* cs = &gConstraints[m_start];	\n"
	"		\n"
	"		\n"
	"		int numValidConstraints = 0;\n"
	"		int batchIdx = 0;\n"
	"		while( numValidConstraints < numConstraints)\n"
	"		{\n"
	"			int nCurrentBatch = 0;\n"
	"			//	clear flag\n"
	"	\n"
	"			for(int i=0; i<CHECK_SIZE; i++) \n"
	"				ldsFixedBuffer[i] = 0;		\n"
	"			for(int i=numValidConstraints; i<numConstraints; i++)\n"
	"			{\n"
	"				int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
	"				int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
	"				int bodyA = abs(bodyAS);\n"
	"				int bodyB = abs(bodyBS);\n"
	"				bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
	"				bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
	"				int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
	"				int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
	"				\n"
	"				if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
	"				{\n"
	"					if (!aIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyA );\n"
	"					}\n"
	"					if (!bIsStatic)\n"
	"					{\n"
	"						writeBuf( ldsFixedBuffer, bodyB );\n"
	"					}\n"
	"					cs[i].m_batchIdx = batchIdx;\n"
	"					if (i!=numValidConstraints)\n"
	"					{\n"
	"						tmp = cs[i];\n"
	"						cs[i] = cs[numValidConstraints];\n"
	"						cs[numValidConstraints]  = tmp;\n"
	"					}\n"
	"					numValidConstraints++;\n"
	"					\n"
	"					nCurrentBatch++;\n"
	"					if( nCurrentBatch == SIMD_WIDTH)\n"
	"					{\n"
	"						nCurrentBatch = 0;\n"
	"						for(int i=0; i<CHECK_SIZE; i++) \n"
	"							ldsFixedBuffer[i] = 0;\n"
	"						\n"
	"					}\n"
	"				}\n"
	"			}//for\n"
	"			batchIdx ++;\n"
	"		}//while\n"
	"		\n"
	"		batchSizes[wgIdx] = batchIdx;\n"
	"	}//if( lIdx == 0 )\n"
	"	\n"
	"	//return batchIdx;\n"
	"}\n";

View File

@@ -1,433 +1,432 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* integrateKernelCL= \
"/*\n"
"Copyright (c) 2013 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#define B3_RIGIDBODY_DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#define B3_QUAT_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Quat;\n"
" #define b3QuatConstArg const b3Quat\n"
" \n"
" \n"
"inline float4 b3FastNormalize4(float4 v)\n"
"{\n"
" v = (float4)(v.xyz,0.f);\n"
" return fast_normalize(v);\n"
"}\n"
" \n"
"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
"{\n"
" b3Quat ans;\n"
" ans = b3Cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
"{\n"
" b3Quat q;\n"
" q=in;\n"
" //return b3FastNormalize4(in);\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" b3Quat qInv = b3QuatInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" return b3QuatRotate( b3QuatInvert( q ), vec );\n"
"}\n"
"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n"
"{\n"
" return b3QuatRotate( orientation, point ) + (translation);\n"
"}\n"
" \n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifndef B3_MAT3x3_H\n"
"#define B3_MAT3x3_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"typedef struct\n"
"{\n"
" b3Float4 m_row[3];\n"
"}b3Mat3x3;\n"
"#define b3Mat3x3ConstArg const b3Mat3x3\n"
"#define b3GetRow(m,row) (m.m_row[row])\n"
"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
"{\n"
" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" b3Mat3x3 out;\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
" return out;\n"
"}\n"
"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = fabs(matIn.m_row[0]);\n"
" out.m_row[1] = fabs(matIn.m_row[1]);\n"
" out.m_row[2] = fabs(matIn.m_row[2]);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtZero();\n"
"__inline\n"
"b3Mat3x3 mtIdentity();\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Mat3x3 mtZero()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(0.f);\n"
" m.m_row[1] = (b3Float4)(0.f);\n"
" m.m_row[2] = (b3Float4)(0.f);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtIdentity()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(1,0,0,0);\n"
" m.m_row[1] = (b3Float4)(0,1,0,0);\n"
" m.m_row[2] = (b3Float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
"{\n"
" b3Mat3x3 transB;\n"
" transB = mtTranspose( b );\n"
" b3Mat3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
"{\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a.m_row[0], b );\n"
" ans.y = b3Dot3F4( a.m_row[1], b );\n"
" ans.z = b3Dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
"{\n"
" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a, colx );\n"
" ans.y = b3Dot3F4( a, coly );\n"
" ans.z = b3Dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
"struct b3RigidBodyData\n"
"{\n"
" b3Float4 m_pos;\n"
" b3Quat m_quat;\n"
" b3Float4 m_linVel;\n"
" b3Float4 m_angVel;\n"
" int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"};\n"
"typedef struct b3InertiaData b3InertiaData_t;\n"
"struct b3InertiaData\n"
"{\n"
" b3Mat3x3 m_invInertiaWorld;\n"
" b3Mat3x3 m_initInvInertia;\n"
"};\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" \n"
" if (bodies[nodeID].m_invMass != 0.f)\n"
" {\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" bodies[nodeID].m_angVel.x *= angularDamping;\n"
" bodies[nodeID].m_angVel.y *= angularDamping;\n"
" bodies[nodeID].m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = bodies[nodeID].m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" \n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" \n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = bodies[nodeID].m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" bodies[nodeID].m_quat=predictedOrn;\n"
" }\n"
" //linear velocity \n"
" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n"
" \n"
" //apply gravity\n"
" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" \n"
" if( (body->m_invMass != 0.f))\n"
" {\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" body->m_angVel.x *= angularDamping;\n"
" body->m_angVel.y *= angularDamping;\n"
" body->m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = body->m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = body->m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" body->m_quat=predictedOrn;\n"
" }\n"
" //apply gravity\n"
" body->m_linVel += gravityAcceleration * timeStep;\n"
" //linear velocity \n"
" body->m_pos += body->m_linVel * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"__kernel void \n"
" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" \n"
" if( nodeID < numNodes)\n"
" {\n"
" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n"
" }\n"
"}\n"
;
static const char* integrateKernelCL =
"/*\n"
"Copyright (c) 2013 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#define B3_RIGIDBODY_DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#define B3_QUAT_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Quat;\n"
" #define b3QuatConstArg const b3Quat\n"
" \n"
" \n"
"inline float4 b3FastNormalize4(float4 v)\n"
"{\n"
" v = (float4)(v.xyz,0.f);\n"
" return fast_normalize(v);\n"
"}\n"
" \n"
"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
"{\n"
" b3Quat ans;\n"
" ans = b3Cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
"{\n"
" b3Quat q;\n"
" q=in;\n"
" //return b3FastNormalize4(in);\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" b3Quat qInv = b3QuatInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" return b3QuatRotate( b3QuatInvert( q ), vec );\n"
"}\n"
"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n"
"{\n"
" return b3QuatRotate( orientation, point ) + (translation);\n"
"}\n"
" \n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifndef B3_MAT3x3_H\n"
"#define B3_MAT3x3_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"typedef struct\n"
"{\n"
" b3Float4 m_row[3];\n"
"}b3Mat3x3;\n"
"#define b3Mat3x3ConstArg const b3Mat3x3\n"
"#define b3GetRow(m,row) (m.m_row[row])\n"
"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
"{\n"
" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" b3Mat3x3 out;\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
" return out;\n"
"}\n"
"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = fabs(matIn.m_row[0]);\n"
" out.m_row[1] = fabs(matIn.m_row[1]);\n"
" out.m_row[2] = fabs(matIn.m_row[2]);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtZero();\n"
"__inline\n"
"b3Mat3x3 mtIdentity();\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Mat3x3 mtZero()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(0.f);\n"
" m.m_row[1] = (b3Float4)(0.f);\n"
" m.m_row[2] = (b3Float4)(0.f);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtIdentity()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(1,0,0,0);\n"
" m.m_row[1] = (b3Float4)(0,1,0,0);\n"
" m.m_row[2] = (b3Float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
"{\n"
" b3Mat3x3 transB;\n"
" transB = mtTranspose( b );\n"
" b3Mat3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
"{\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a.m_row[0], b );\n"
" ans.y = b3Dot3F4( a.m_row[1], b );\n"
" ans.z = b3Dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
"{\n"
" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a, colx );\n"
" ans.y = b3Dot3F4( a, coly );\n"
" ans.z = b3Dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
"struct b3RigidBodyData\n"
"{\n"
" b3Float4 m_pos;\n"
" b3Quat m_quat;\n"
" b3Float4 m_linVel;\n"
" b3Float4 m_angVel;\n"
" int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"};\n"
"typedef struct b3InertiaData b3InertiaData_t;\n"
"struct b3InertiaData\n"
"{\n"
" b3Mat3x3 m_invInertiaWorld;\n"
" b3Mat3x3 m_initInvInertia;\n"
"};\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"inline void integrateSingleTransform( __global b3RigidBodyData_t* bodies,int nodeID, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" \n"
" if (bodies[nodeID].m_invMass != 0.f)\n"
" {\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" bodies[nodeID].m_angVel.x *= angularDamping;\n"
" bodies[nodeID].m_angVel.y *= angularDamping;\n"
" bodies[nodeID].m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = bodies[nodeID].m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" \n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" \n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = bodies[nodeID].m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" bodies[nodeID].m_quat=predictedOrn;\n"
" }\n"
" //linear velocity \n"
" bodies[nodeID].m_pos += bodies[nodeID].m_linVel * timeStep;\n"
" \n"
" //apply gravity\n"
" bodies[nodeID].m_linVel += gravityAcceleration * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"inline void b3IntegrateTransform( __global b3RigidBodyData_t* body, float timeStep, float angularDamping, b3Float4ConstArg gravityAcceleration)\n"
"{\n"
" float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
" \n"
" if( (body->m_invMass != 0.f))\n"
" {\n"
" //angular velocity\n"
" {\n"
" b3Float4 axis;\n"
" //add some hardcoded angular damping\n"
" body->m_angVel.x *= angularDamping;\n"
" body->m_angVel.y *= angularDamping;\n"
" body->m_angVel.z *= angularDamping;\n"
" \n"
" b3Float4 angvel = body->m_angVel;\n"
" float fAngle = b3Sqrt(b3Dot3F4(angvel, angvel));\n"
" //limit the angular motion\n"
" if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
" {\n"
" fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
" }\n"
" if(fAngle < 0.001f)\n"
" {\n"
" // use Taylor's expansions of sync function\n"
" axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
" }\n"
" else\n"
" {\n"
" // sync(fAngle) = sin(c*fAngle)/t\n"
" axis = angvel * ( b3Sin(0.5f * fAngle * timeStep) / fAngle);\n"
" }\n"
" b3Quat dorn;\n"
" dorn.x = axis.x;\n"
" dorn.y = axis.y;\n"
" dorn.z = axis.z;\n"
" dorn.w = b3Cos(fAngle * timeStep * 0.5f);\n"
" b3Quat orn0 = body->m_quat;\n"
" b3Quat predictedOrn = b3QuatMul(dorn, orn0);\n"
" predictedOrn = b3QuatNormalized(predictedOrn);\n"
" body->m_quat=predictedOrn;\n"
" }\n"
" //apply gravity\n"
" body->m_linVel += gravityAcceleration * timeStep;\n"
" //linear velocity \n"
" body->m_pos += body->m_linVel * timeStep;\n"
" \n"
" }\n"
" \n"
"}\n"
"__kernel void \n"
" integrateTransformsKernel( __global b3RigidBodyData_t* bodies,const int numNodes, float timeStep, float angularDamping, float4 gravityAcceleration)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" \n"
" if( nodeID < numNodes)\n"
" {\n"
" integrateSingleTransform(bodies,nodeID, timeStep, angularDamping,gravityAcceleration);\n"
" }\n"
"}\n";

File diff suppressed because it is too large Load Diff

View File

@@ -1,393 +1,392 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solveContactCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"#define WG_SIZE 64\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
"void solveContact(__global Constraint4* cs,\n"
" float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
" float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
"{\n"
" float minRambdaDt = 0;\n"
" float maxRambdaDt = FLT_MAX;\n"
" for(int ic=0; ic<4; ic++)\n"
" {\n"
" if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = cs->m_worldPos[ic] - posA;\n"
" float4 r1 = cs->m_worldPos[ic] - posB;\n"
" setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
" *linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
" rambdaDt *= cs->m_jacCoeffInv[ic];\n"
" {\n"
" float prevSum = cs->m_appliedRambdaDt[ic];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt );\n"
" updated = min2( updated, maxRambdaDt );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_appliedRambdaDt[ic] = updated;\n"
" }\n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" *linVelA += linImp0;\n"
" *angVelA += angImp0;\n"
" *linVelB += linImp1;\n"
" *angVelB += angImp1;\n"
" }\n"
"}\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" //float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
" posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" \n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" \n"
" }\n"
"}\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelContact(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" \n"
" \n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
" //int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
" //int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
" //int cellIdx = xIdx+yIdx*nSplit;\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" \n"
" \n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"
" \n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"__kernel void solveSingleContactKernel(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" int cellIdx,\n"
" int batchOffset,\n"
" int numConstraintsInBatch\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if (index < numConstraintsInBatch)\n"
" {\n"
" int idx=batchOffset+index;\n"
" solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" } \n"
"}\n"
;
// NOTE(review): this is the clang-formatted ("after") version of the same
// autogenerated `solveContactCL` string; the pre-format ("before") version
// appears above in this capture. A real translation unit cannot define the
// variable twice -- this duplication is a diff-rendering artifact; verify
// against the real file.
// The string content itself is runtime data (the embedded OpenCL program
// handed to the OpenCL compiler), regenerated by stringify.bat; do not
// hand-edit or reformat the bytes inside the quotes.
static const char* solveContactCL =
	"/*\n"
	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
	"This software is provided 'as-is', without any express or implied warranty.\n"
	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
	"Permission is granted to anyone to use this software for any purpose, \n"
	"including commercial applications, and to alter it and redistribute it freely, \n"
	"subject to the following restrictions:\n"
	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
	"3. This notice may not be removed or altered from any source distribution.\n"
	"*/\n"
	"//Originally written by Takahiro Harada\n"
	"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
	"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
	"#ifdef cl_ext_atomic_counters_32\n"
	"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
	"#else\n"
	"#define counter32_t volatile global int*\n"
	"#endif\n"
	"typedef unsigned int u32;\n"
	"typedef unsigned short u16;\n"
	"typedef unsigned char u8;\n"
	"#define GET_GROUP_IDX get_group_id(0)\n"
	"#define GET_LOCAL_IDX get_local_id(0)\n"
	"#define GET_GLOBAL_IDX get_global_id(0)\n"
	"#define GET_GROUP_SIZE get_local_size(0)\n"
	"#define GET_NUM_GROUPS get_num_groups(0)\n"
	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
	"#define AtomInc(x) atom_inc(&(x))\n"
	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
	"#define AppendInc(x, out) out = atomic_inc(x)\n"
	"#define AtomAdd(x, value) atom_add(&(x), value)\n"
	"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
	"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
	"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
	"#define mymake_float4 (float4)\n"
	"//#define make_float2 (float2)\n"
	"//#define make_uint4 (uint4)\n"
	"//#define make_int4 (int4)\n"
	"//#define make_uint2 (uint2)\n"
	"//#define make_int2 (int2)\n"
	"#define max2 max\n"
	"#define min2 min\n"
	"///////////////////////////////////////\n"
	"//	Vector\n"
	"///////////////////////////////////////\n"
	"__inline\n"
	"float4 fastNormalize4(float4 v)\n"
	"{\n"
	"	return fast_normalize(v);\n"
	"}\n"
	"__inline\n"
	"float4 cross3(float4 a, float4 b)\n"
	"{\n"
	"	return cross(a,b);\n"
	"}\n"
	"__inline\n"
	"float dot3F4(float4 a, float4 b)\n"
	"{\n"
	"	float4 a1 = mymake_float4(a.xyz,0.f);\n"
	"	float4 b1 = mymake_float4(b.xyz,0.f);\n"
	"	return dot(a1, b1);\n"
	"}\n"
	"__inline\n"
	"float4 normalize3(const float4 a)\n"
	"{\n"
	"	float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
	"	return fastNormalize4( n );\n"
	"//	float length = sqrtf(dot3F4(a, a));\n"
	"//	return 1.f/length * a;\n"
	"}\n"
	"///////////////////////////////////////\n"
	"//	Matrix3x3\n"
	"///////////////////////////////////////\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_row[3];\n"
	"}Matrix3x3;\n"
	"__inline\n"
	"float4 mtMul1(Matrix3x3 a, float4 b);\n"
	"__inline\n"
	"float4 mtMul3(float4 a, Matrix3x3 b);\n"
	"__inline\n"
	"float4 mtMul1(Matrix3x3 a, float4 b)\n"
	"{\n"
	"	float4 ans;\n"
	"	ans.x = dot3F4( a.m_row[0], b );\n"
	"	ans.y = dot3F4( a.m_row[1], b );\n"
	"	ans.z = dot3F4( a.m_row[2], b );\n"
	"	ans.w = 0.f;\n"
	"	return ans;\n"
	"}\n"
	"__inline\n"
	"float4 mtMul3(float4 a, Matrix3x3 b)\n"
	"{\n"
	"	float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
	"	float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
	"	float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
	"	float4 ans;\n"
	"	ans.x = dot3F4( a, colx );\n"
	"	ans.y = dot3F4( a, coly );\n"
	"	ans.z = dot3F4( a, colz );\n"
	"	return ans;\n"
	"}\n"
	"///////////////////////////////////////\n"
	"//	Quaternion\n"
	"///////////////////////////////////////\n"
	"typedef float4 Quaternion;\n"
	"#define WG_SIZE 64\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_pos;\n"
	"	Quaternion m_quat;\n"
	"	float4 m_linVel;\n"
	"	float4 m_angVel;\n"
	"	u32 m_shapeIdx;\n"
	"	float m_invMass;\n"
	"	float m_restituitionCoeff;\n"
	"	float m_frictionCoeff;\n"
	"} Body;\n"
	"typedef struct\n"
	"{\n"
	"	Matrix3x3 m_invInertia;\n"
	"	Matrix3x3 m_initInvInertia;\n"
	"} Shape;\n"
	"typedef struct\n"
	"{\n"
	"	float4 m_linear;\n"
	"	float4 m_worldPos[4];\n"
	"	float4 m_center;	\n"
	"	float m_jacCoeffInv[4];\n"
	"	float m_b[4];\n"
	"	float m_appliedRambdaDt[4];\n"
	"	float m_fJacCoeffInv[2];	\n"
	"	float m_fAppliedRambdaDt[2];	\n"
	"	u32 m_bodyA;\n"
	"	u32 m_bodyB;\n"
	"	int m_batchIdx;\n"
	"	u32 m_paddings[1];\n"
	"} Constraint4;\n"
	"typedef struct\n"
	"{\n"
	"	int m_nConstraints;\n"
	"	int m_start;\n"
	"	int m_batchIdx;\n"
	"	int m_nSplit;\n"
	"//	int m_paddings[1];\n"
	"} ConstBuffer;\n"
	"typedef struct\n"
	"{\n"
	"	int m_solveFriction;\n"
	"	int m_maxBatch;	//	long batch really kills the performance\n"
	"	int m_batchIdx;\n"
	"	int m_nSplit;\n"
	"//	int m_paddings[1];\n"
	"} ConstBufferBatchSolve;\n"
	"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
	"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
	"{\n"
	"	*linear = mymake_float4(-n.xyz,0.f);\n"
	"	*angular0 = -cross3(r0, n);\n"
	"	*angular1 = cross3(r1, n);\n"
	"}\n"
	"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
	"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
	"{\n"
	"	return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
	"}\n"
	"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
	"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
	"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
	"					float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
	"{\n"
	"	//	linear0,1 are normlized\n"
	"	float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
	"	float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
	"	float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
	"	float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
	"	return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
	"}\n"
	"void solveContact(__global Constraint4* cs,\n"
	"			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
	"			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB);\n"
	"void solveContact(__global Constraint4* cs,\n"
	"			float4 posA, float4* linVelA, float4* angVelA, float invMassA, Matrix3x3 invInertiaA,\n"
	"			float4 posB, float4* linVelB, float4* angVelB, float invMassB, Matrix3x3 invInertiaB)\n"
	"{\n"
	"	float minRambdaDt = 0;\n"
	"	float maxRambdaDt = FLT_MAX;\n"
	"	for(int ic=0; ic<4; ic++)\n"
	"	{\n"
	"		if( cs->m_jacCoeffInv[ic] == 0.f ) continue;\n"
	"		float4 angular0, angular1, linear;\n"
	"		float4 r0 = cs->m_worldPos[ic] - posA;\n"
	"		float4 r1 = cs->m_worldPos[ic] - posB;\n"
	"		setLinearAndAngular( -cs->m_linear, r0, r1, &linear, &angular0, &angular1 );\n"
	"		float rambdaDt = calcRelVel( cs->m_linear, -cs->m_linear, angular0, angular1, \n"
	"			*linVelA, *angVelA, *linVelB, *angVelB ) + cs->m_b[ic];\n"
	"		rambdaDt *= cs->m_jacCoeffInv[ic];\n"
	"		{\n"
	"			float prevSum = cs->m_appliedRambdaDt[ic];\n"
	"			float updated = prevSum;\n"
	"			updated += rambdaDt;\n"
	"			updated = max2( updated, minRambdaDt );\n"
	"			updated = min2( updated, maxRambdaDt );\n"
	"			rambdaDt = updated - prevSum;\n"
	"			cs->m_appliedRambdaDt[ic] = updated;\n"
	"		}\n"
	"		float4 linImp0 = invMassA*linear*rambdaDt;\n"
	"		float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
	"		float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
	"		float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
	"		*linVelA += linImp0;\n"
	"		*angVelA += angImp0;\n"
	"		*linVelB += linImp1;\n"
	"		*angVelB += angImp1;\n"
	"	}\n"
	"}\n"
	"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
	" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
	"{\n"
	"  if (fabs(n[0].z) > 0.70710678f) {\n"
	"    // choose p in y-z plane\n"
	"    float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
	"    float k = 1.f/sqrt(a);\n"
	"    p[0].x = 0;\n"
	"	p[0].y = -n[0].z*k;\n"
	"	p[0].z = n[0].y*k;\n"
	"    // set q = n x p\n"
	"    q[0].x = a*k;\n"
	"	q[0].y = -n[0].x*p[0].z;\n"
	"	q[0].z = n[0].x*p[0].y;\n"
	"  }\n"
	"  else {\n"
	"    // choose p in x-y plane\n"
	"    float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
	"    float k = 1.f/sqrt(a);\n"
	"    p[0].x = -n[0].y*k;\n"
	"	p[0].y = n[0].x*k;\n"
	"	p[0].z = 0;\n"
	"    // set q = n x p\n"
	"    q[0].x = -n[0].z*p[0].y;\n"
	"	q[0].y = n[0].z*p[0].x;\n"
	"	q[0].z = a*k;\n"
	"  }\n"
	"}\n"
	"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
	"void solveContactConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
	"{\n"
	"	//float frictionCoeff = ldsCs[0].m_linear.w;\n"
	"	int aIdx = ldsCs[0].m_bodyA;\n"
	"	int bIdx = ldsCs[0].m_bodyB;\n"
	"	float4 posA = gBodies[aIdx].m_pos;\n"
	"	float4 linVelA = gBodies[aIdx].m_linVel;\n"
	"	float4 angVelA = gBodies[aIdx].m_angVel;\n"
	"	float invMassA = gBodies[aIdx].m_invMass;\n"
	"	Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
	"	float4 posB = gBodies[bIdx].m_pos;\n"
	"	float4 linVelB = gBodies[bIdx].m_linVel;\n"
	"	float4 angVelB = gBodies[bIdx].m_angVel;\n"
	"	float invMassB = gBodies[bIdx].m_invMass;\n"
	"	Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
	"	solveContact( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
	"			posB, &linVelB, &angVelB, invMassB, invInertiaB );\n"
	"	if (gBodies[aIdx].m_invMass)\n"
	"	{\n"
	"		gBodies[aIdx].m_linVel = linVelA;\n"
	"		gBodies[aIdx].m_angVel = angVelA;\n"
	"	} else\n"
	"	{\n"
	"		gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
	"		gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
	"		\n"
	"	}\n"
	"	if (gBodies[bIdx].m_invMass)\n"
	"	{\n"
	"		gBodies[bIdx].m_linVel = linVelB;\n"
	"		gBodies[bIdx].m_angVel = angVelB;\n"
	"	} else\n"
	"	{\n"
	"		gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
	"		gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
	"		\n"
	"	}\n"
	"}\n"
	"typedef struct \n"
	"{\n"
	"	int m_valInt0;\n"
	"	int m_valInt1;\n"
	"	int m_valInt2;\n"
	"	int m_valInt3;\n"
	"	float m_val0;\n"
	"	float m_val1;\n"
	"	float m_val2;\n"
	"	float m_val3;\n"
	"} SolverDebugInfo;\n"
	"__kernel\n"
	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
	"void BatchSolveKernelContact(__global Body* gBodies,\n"
	"                      __global Shape* gShapes,\n"
	"                      __global Constraint4* gConstraints,\n"
	"                      __global int* gN,\n"
	"                      __global int* gOffsets,\n"
	"                      __global int* batchSizes,\n"
	"                      int maxBatch1,\n"
	"                      int cellBatch,\n"
	"                      int4 nSplit\n"
	"                      )\n"
	"{\n"
	"	//__local int ldsBatchIdx[WG_SIZE+1];\n"
	"	__local int ldsCurBatch;\n"
	"	__local int ldsNextBatch;\n"
	"	__local int ldsStart;\n"
	"	int lIdx = GET_LOCAL_IDX;\n"
	"	int wgIdx = GET_GROUP_IDX;\n"
	"//	int gIdx = GET_GLOBAL_IDX;\n"
	"//	debugInfo[gIdx].m_valInt0 = gIdx;\n"
	"	//debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
	"	\n"
	"	\n"
	"	int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
	"	int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
	"	int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
	"	int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
	"	int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
	"	//int xIdx = (wgIdx/(nSplit/2))*2 + (bIdx&1);\n"
	"	//int yIdx = (wgIdx%(nSplit/2))*2 + (bIdx>>1);\n"
	"	//int cellIdx = xIdx+yIdx*nSplit;\n"
	"	\n"
	"	if( gN[cellIdx] == 0 ) \n"
	"		return;\n"
	"	int maxBatch = batchSizes[cellIdx];\n"
	"	\n"
	"	\n"
	"	const int start = gOffsets[cellIdx];\n"
	"	const int end = start + gN[cellIdx];\n"
	"	\n"
	"	\n"
	"	\n"
	"	if( lIdx == 0 )\n"
	"	{\n"
	"		ldsCurBatch = 0;\n"
	"		ldsNextBatch = 0;\n"
	"		ldsStart = start;\n"
	"	}\n"
	"	GROUP_LDS_BARRIER;\n"
	"	int idx=ldsStart+lIdx;\n"
	"	while (ldsCurBatch < maxBatch)\n"
	"	{\n"
	"		for(; idx<end; )\n"
	"		{\n"
	"			if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
	"			{\n"
	"				solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
	"				 idx+=64;\n"
	"			} else\n"
	"			{\n"
	"				break;\n"
	"			}\n"
	"		}\n"
	"		GROUP_LDS_BARRIER;\n"
	"		\n"
	"		if( lIdx == 0 )\n"
	"		{\n"
	"			ldsCurBatch++;\n"
	"		}\n"
	"		GROUP_LDS_BARRIER;\n"
	"	}\n"
	"	\n"
	"	\n"
	"}\n"
	"__kernel void solveSingleContactKernel(__global Body* gBodies,\n"
	"                      __global Shape* gShapes,\n"
	"                      __global Constraint4* gConstraints,\n"
	"                      int cellIdx,\n"
	"                      int batchOffset,\n"
	"                      int numConstraintsInBatch\n"
	"                      )\n"
	"{\n"
	"	int index = get_global_id(0);\n"
	"	if (index < numConstraintsInBatch)\n"
	"	{\n"
	"		int idx=batchOffset+index;\n"
	"		solveContactConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
	"	}	\n"
	"}\n";

View File

@@ -1,421 +1,420 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* solveFrictionCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"#define WG_SIZE 64\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" \n"
" {\n"
" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
" float sum = 0;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
" }\n"
" frictionCoeff = 0.7f;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" maxRambdaDt[j] = frictionCoeff*sum;\n"
" minRambdaDt[j] = -maxRambdaDt[j];\n"
" }\n"
" \n"
"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
" \n"
" \n"
" {\n"
" \n"
" __global Constraint4* cs = ldsCs;\n"
" \n"
" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n"
" const float4 center = cs->m_center;\n"
" \n"
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB );\n"
" rambdaDt *= cs->m_fJacCoeffInv[i];\n"
" \n"
" {\n"
" float prevSum = cs->m_fAppliedRambdaDt[i];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt[i] );\n"
" updated = min2( updated, maxRambdaDt[i] );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_fAppliedRambdaDt[i] = updated;\n"
" }\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" \n"
" linVelA += linImp0;\n"
" angVelA += angImp0;\n"
" linVelB += linImp1;\n"
" angVelB += angImp1;\n"
" }\n"
" { // angular damping for point constraint\n"
" float4 ab = normalize3( posB - posA );\n"
" float4 ac = normalize3( center - posA );\n"
" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n"
" {\n"
" float angNA = dot3F4( n, angVelA );\n"
" float angNB = dot3F4( n, angVelB );\n"
" \n"
" angVelA -= (angNA*0.1f)*n;\n"
" angVelB -= (angNB*0.1f)*n;\n"
" }\n"
" }\n"
" }\n"
" \n"
" \n"
" }\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" \n"
"}\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelFriction(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" int cellIdx,\n"
" int batchOffset,\n"
" int numConstraintsInBatch\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if (index < numConstraintsInBatch)\n"
" {\n"
" \n"
" int idx=batchOffset+index;\n"
" \n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" } \n"
"}\n"
;
static const char* solveFrictionCL =
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Takahiro Harada\n"
"//#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile global int*\n"
"#endif\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"#define mymake_float4 (float4)\n"
"//#define make_float2 (float2)\n"
"//#define make_uint4 (uint4)\n"
"//#define make_int4 (int4)\n"
"//#define make_uint2 (uint2)\n"
"//#define make_int2 (int2)\n"
"#define max2 max\n"
"#define min2 min\n"
"///////////////////////////////////////\n"
"// Vector\n"
"///////////////////////////////////////\n"
"__inline\n"
"float4 fastNormalize4(float4 v)\n"
"{\n"
" return fast_normalize(v);\n"
"}\n"
"__inline\n"
"float4 cross3(float4 a, float4 b)\n"
"{\n"
" return cross(a,b);\n"
"}\n"
"__inline\n"
"float dot3F4(float4 a, float4 b)\n"
"{\n"
" float4 a1 = mymake_float4(a.xyz,0.f);\n"
" float4 b1 = mymake_float4(b.xyz,0.f);\n"
" return dot(a1, b1);\n"
"}\n"
"__inline\n"
"float4 normalize3(const float4 a)\n"
"{\n"
" float4 n = mymake_float4(a.x, a.y, a.z, 0.f);\n"
" return fastNormalize4( n );\n"
"// float length = sqrtf(dot3F4(a, a));\n"
"// return 1.f/length * a;\n"
"}\n"
"///////////////////////////////////////\n"
"// Matrix3x3\n"
"///////////////////////////////////////\n"
"typedef struct\n"
"{\n"
" float4 m_row[3];\n"
"}Matrix3x3;\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b);\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b);\n"
"__inline\n"
"float4 mtMul1(Matrix3x3 a, float4 b)\n"
"{\n"
" float4 ans;\n"
" ans.x = dot3F4( a.m_row[0], b );\n"
" ans.y = dot3F4( a.m_row[1], b );\n"
" ans.z = dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"float4 mtMul3(float4 a, Matrix3x3 b)\n"
"{\n"
" float4 colx = mymake_float4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" float4 coly = mymake_float4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" float4 colz = mymake_float4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" float4 ans;\n"
" ans.x = dot3F4( a, colx );\n"
" ans.y = dot3F4( a, coly );\n"
" ans.z = dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"///////////////////////////////////////\n"
"// Quaternion\n"
"///////////////////////////////////////\n"
"typedef float4 Quaternion;\n"
"#define WG_SIZE 64\n"
"typedef struct\n"
"{\n"
" float4 m_pos;\n"
" Quaternion m_quat;\n"
" float4 m_linVel;\n"
" float4 m_angVel;\n"
" u32 m_shapeIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"} Body;\n"
"typedef struct\n"
"{\n"
" Matrix3x3 m_invInertia;\n"
" Matrix3x3 m_initInvInertia;\n"
"} Shape;\n"
"typedef struct\n"
"{\n"
" float4 m_linear;\n"
" float4 m_worldPos[4];\n"
" float4 m_center; \n"
" float m_jacCoeffInv[4];\n"
" float m_b[4];\n"
" float m_appliedRambdaDt[4];\n"
" float m_fJacCoeffInv[2]; \n"
" float m_fAppliedRambdaDt[2]; \n"
" u32 m_bodyA;\n"
" u32 m_bodyB;\n"
" int m_batchIdx;\n"
" u32 m_paddings[1];\n"
"} Constraint4;\n"
"typedef struct\n"
"{\n"
" int m_nConstraints;\n"
" int m_start;\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBuffer;\n"
"typedef struct\n"
"{\n"
" int m_solveFriction;\n"
" int m_maxBatch; // long batch really kills the performance\n"
" int m_batchIdx;\n"
" int m_nSplit;\n"
"// int m_paddings[1];\n"
"} ConstBufferBatchSolve;\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1);\n"
"void setLinearAndAngular( float4 n, float4 r0, float4 r1, float4* linear, float4* angular0, float4* angular1)\n"
"{\n"
" *linear = mymake_float4(-n.xyz,0.f);\n"
" *angular0 = -cross3(r0, n);\n"
" *angular1 = cross3(r1, n);\n"
"}\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 );\n"
"float calcRelVel( float4 l0, float4 l1, float4 a0, float4 a1, float4 linVel0, float4 angVel0, float4 linVel1, float4 angVel1 )\n"
"{\n"
" return dot3F4(l0, linVel0) + dot3F4(a0, angVel0) + dot3F4(l1, linVel1) + dot3F4(a1, angVel1);\n"
"}\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1);\n"
"float calcJacCoeff(const float4 linear0, const float4 linear1, const float4 angular0, const float4 angular1,\n"
" float invMass0, const Matrix3x3* invInertia0, float invMass1, const Matrix3x3* invInertia1)\n"
"{\n"
" // linear0,1 are normlized\n"
" float jmj0 = invMass0;//dot3F4(linear0, linear0)*invMass0;\n"
" float jmj1 = dot3F4(mtMul3(angular0,*invInertia0), angular0);\n"
" float jmj2 = invMass1;//dot3F4(linear1, linear1)*invMass1;\n"
" float jmj3 = dot3F4(mtMul3(angular1,*invInertia1), angular1);\n"
" return -1.f/(jmj0+jmj1+jmj2+jmj3);\n"
"}\n"
"void btPlaneSpace1 (const float4* n, float4* p, float4* q);\n"
" void btPlaneSpace1 (const float4* n, float4* p, float4* q)\n"
"{\n"
" if (fabs(n[0].z) > 0.70710678f) {\n"
" // choose p in y-z plane\n"
" float a = n[0].y*n[0].y + n[0].z*n[0].z;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = 0;\n"
" p[0].y = -n[0].z*k;\n"
" p[0].z = n[0].y*k;\n"
" // set q = n x p\n"
" q[0].x = a*k;\n"
" q[0].y = -n[0].x*p[0].z;\n"
" q[0].z = n[0].x*p[0].y;\n"
" }\n"
" else {\n"
" // choose p in x-y plane\n"
" float a = n[0].x*n[0].x + n[0].y*n[0].y;\n"
" float k = 1.f/sqrt(a);\n"
" p[0].x = -n[0].y*k;\n"
" p[0].y = n[0].x*k;\n"
" p[0].z = 0;\n"
" // set q = n x p\n"
" q[0].x = -n[0].z*p[0].y;\n"
" q[0].y = n[0].z*p[0].x;\n"
" q[0].z = a*k;\n"
" }\n"
"}\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs);\n"
"void solveFrictionConstraint(__global Body* gBodies, __global Shape* gShapes, __global Constraint4* ldsCs)\n"
"{\n"
" float frictionCoeff = ldsCs[0].m_linear.w;\n"
" int aIdx = ldsCs[0].m_bodyA;\n"
" int bIdx = ldsCs[0].m_bodyB;\n"
" float4 posA = gBodies[aIdx].m_pos;\n"
" float4 linVelA = gBodies[aIdx].m_linVel;\n"
" float4 angVelA = gBodies[aIdx].m_angVel;\n"
" float invMassA = gBodies[aIdx].m_invMass;\n"
" Matrix3x3 invInertiaA = gShapes[aIdx].m_invInertia;\n"
" float4 posB = gBodies[bIdx].m_pos;\n"
" float4 linVelB = gBodies[bIdx].m_linVel;\n"
" float4 angVelB = gBodies[bIdx].m_angVel;\n"
" float invMassB = gBodies[bIdx].m_invMass;\n"
" Matrix3x3 invInertiaB = gShapes[bIdx].m_invInertia;\n"
" \n"
" {\n"
" float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};\n"
" float minRambdaDt[4] = {0.f,0.f,0.f,0.f};\n"
" float sum = 0;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" sum +=ldsCs[0].m_appliedRambdaDt[j];\n"
" }\n"
" frictionCoeff = 0.7f;\n"
" for(int j=0; j<4; j++)\n"
" {\n"
" maxRambdaDt[j] = frictionCoeff*sum;\n"
" minRambdaDt[j] = -maxRambdaDt[j];\n"
" }\n"
" \n"
"// solveFriction( ldsCs, posA, &linVelA, &angVelA, invMassA, invInertiaA,\n"
"// posB, &linVelB, &angVelB, invMassB, invInertiaB, maxRambdaDt, minRambdaDt );\n"
" \n"
" \n"
" {\n"
" \n"
" __global Constraint4* cs = ldsCs;\n"
" \n"
" if( cs->m_fJacCoeffInv[0] == 0 && cs->m_fJacCoeffInv[0] == 0 ) return;\n"
" const float4 center = cs->m_center;\n"
" \n"
" float4 n = -cs->m_linear;\n"
" \n"
" float4 tangent[2];\n"
" btPlaneSpace1(&n,&tangent[0],&tangent[1]);\n"
" float4 angular0, angular1, linear;\n"
" float4 r0 = center - posA;\n"
" float4 r1 = center - posB;\n"
" for(int i=0; i<2; i++)\n"
" {\n"
" setLinearAndAngular( tangent[i], r0, r1, &linear, &angular0, &angular1 );\n"
" float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,\n"
" linVelA, angVelA, linVelB, angVelB );\n"
" rambdaDt *= cs->m_fJacCoeffInv[i];\n"
" \n"
" {\n"
" float prevSum = cs->m_fAppliedRambdaDt[i];\n"
" float updated = prevSum;\n"
" updated += rambdaDt;\n"
" updated = max2( updated, minRambdaDt[i] );\n"
" updated = min2( updated, maxRambdaDt[i] );\n"
" rambdaDt = updated - prevSum;\n"
" cs->m_fAppliedRambdaDt[i] = updated;\n"
" }\n"
" \n"
" float4 linImp0 = invMassA*linear*rambdaDt;\n"
" float4 linImp1 = invMassB*(-linear)*rambdaDt;\n"
" float4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;\n"
" float4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;\n"
" \n"
" linVelA += linImp0;\n"
" angVelA += angImp0;\n"
" linVelB += linImp1;\n"
" angVelB += angImp1;\n"
" }\n"
" { // angular damping for point constraint\n"
" float4 ab = normalize3( posB - posA );\n"
" float4 ac = normalize3( center - posA );\n"
" if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))\n"
" {\n"
" float angNA = dot3F4( n, angVelA );\n"
" float angNB = dot3F4( n, angVelB );\n"
" \n"
" angVelA -= (angNA*0.1f)*n;\n"
" angVelB -= (angNB*0.1f)*n;\n"
" }\n"
" }\n"
" }\n"
" \n"
" \n"
" }\n"
" if (gBodies[aIdx].m_invMass)\n"
" {\n"
" gBodies[aIdx].m_linVel = linVelA;\n"
" gBodies[aIdx].m_angVel = angVelA;\n"
" } else\n"
" {\n"
" gBodies[aIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[aIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" if (gBodies[bIdx].m_invMass)\n"
" {\n"
" gBodies[bIdx].m_linVel = linVelB;\n"
" gBodies[bIdx].m_angVel = angVelB;\n"
" } else\n"
" {\n"
" gBodies[bIdx].m_linVel = mymake_float4(0,0,0,0);\n"
" gBodies[bIdx].m_angVel = mymake_float4(0,0,0,0);\n"
" }\n"
" \n"
"}\n"
"typedef struct \n"
"{\n"
" int m_valInt0;\n"
" int m_valInt1;\n"
" int m_valInt2;\n"
" int m_valInt3;\n"
" float m_val0;\n"
" float m_val1;\n"
" float m_val2;\n"
" float m_val3;\n"
"} SolverDebugInfo;\n"
"__kernel\n"
"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
"void BatchSolveKernelFriction(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" __global int* gN,\n"
" __global int* gOffsets,\n"
" __global int* batchSizes,\n"
" int maxBatch1,\n"
" int cellBatch,\n"
" int4 nSplit\n"
" )\n"
"{\n"
" //__local int ldsBatchIdx[WG_SIZE+1];\n"
" __local int ldsCurBatch;\n"
" __local int ldsNextBatch;\n"
" __local int ldsStart;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" int wgIdx = GET_GROUP_IDX;\n"
"// int gIdx = GET_GLOBAL_IDX;\n"
"// debugInfo[gIdx].m_valInt0 = gIdx;\n"
" //debugInfo[gIdx].m_valInt1 = GET_GROUP_SIZE;\n"
" int zIdx = (wgIdx/((nSplit.x*nSplit.y)/4))*2+((cellBatch&4)>>2);\n"
" int remain= (wgIdx%((nSplit.x*nSplit.y)/4));\n"
" int yIdx = (remain/(nSplit.x/2))*2 + ((cellBatch&2)>>1);\n"
" int xIdx = (remain%(nSplit.x/2))*2 + (cellBatch&1);\n"
" int cellIdx = xIdx+yIdx*nSplit.x+zIdx*(nSplit.x*nSplit.y);\n"
" \n"
" if( gN[cellIdx] == 0 ) \n"
" return;\n"
" int maxBatch = batchSizes[cellIdx];\n"
" const int start = gOffsets[cellIdx];\n"
" const int end = start + gN[cellIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch = 0;\n"
" ldsNextBatch = 0;\n"
" ldsStart = start;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" int idx=ldsStart+lIdx;\n"
" while (ldsCurBatch < maxBatch)\n"
" {\n"
" for(; idx<end; )\n"
" {\n"
" if (gConstraints[idx].m_batchIdx == ldsCurBatch)\n"
" {\n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" idx+=64;\n"
" } else\n"
" {\n"
" break;\n"
" }\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" if( lIdx == 0 )\n"
" {\n"
" ldsCurBatch++;\n"
" }\n"
" GROUP_LDS_BARRIER;\n"
" }\n"
" \n"
" \n"
"}\n"
"__kernel void solveSingleFrictionKernel(__global Body* gBodies,\n"
" __global Shape* gShapes,\n"
" __global Constraint4* gConstraints,\n"
" int cellIdx,\n"
" int batchOffset,\n"
" int numConstraintsInBatch\n"
" )\n"
"{\n"
" int index = get_global_id(0);\n"
" if (index < numConstraintsInBatch)\n"
" {\n"
" \n"
" int idx=batchOffset+index;\n"
" \n"
" solveFrictionConstraint( gBodies, gShapes, &gConstraints[idx] );\n"
" } \n"
"}\n";

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,483 +1,482 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* updateAabbsKernelCL= \
"#ifndef B3_UPDATE_AABBS_H\n"
"#define B3_UPDATE_AABBS_H\n"
"#ifndef B3_AABB_H\n"
"#define B3_AABB_H\n"
"#ifndef B3_FLOAT4_H\n"
"#define B3_FLOAT4_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#define B3_PLATFORM_DEFINITIONS_H\n"
"struct MyTest\n"
"{\n"
" int bla;\n"
"};\n"
"#ifdef __cplusplus\n"
"#else\n"
"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
"#define B3_LARGE_FLOAT 1e18f\n"
"#define B3_INFINITY 1e18f\n"
"#define b3Assert(a)\n"
"#define b3ConstArray(a) __global const a*\n"
"#define b3AtomicInc atomic_inc\n"
"#define b3AtomicAdd atomic_add\n"
"#define b3Fabs fabs\n"
"#define b3Sqrt native_sqrt\n"
"#define b3Sin native_sin\n"
"#define b3Cos native_cos\n"
"#define B3_STATIC\n"
"#endif\n"
"#endif\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Float4;\n"
" #define b3Float4ConstArg const b3Float4\n"
" #define b3MakeFloat4 (float4)\n"
" float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return dot(a1, b1);\n"
" }\n"
" b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
" {\n"
" float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
" float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
" return cross(a1, b1);\n"
" }\n"
" #define b3MinFloat4 min\n"
" #define b3MaxFloat4 max\n"
" #define b3Normalized(a) normalize(a)\n"
"#endif \n"
" \n"
"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
"{\n"
" if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6) \n"
" return false;\n"
" return true;\n"
"}\n"
"inline int b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
"{\n"
" float maxDot = -B3_INFINITY;\n"
" int i = 0;\n"
" int ptIndex = -1;\n"
" for( i = 0; i < vecLen; i++ )\n"
" {\n"
" float dot = b3Dot3F4(vecArray[i],vec);\n"
" \n"
" if( dot > maxDot )\n"
" {\n"
" maxDot = dot;\n"
" ptIndex = i;\n"
" }\n"
" }\n"
" b3Assert(ptIndex>=0);\n"
" if (ptIndex<0)\n"
" {\n"
" ptIndex = 0;\n"
" }\n"
" *dotOut = maxDot;\n"
" return ptIndex;\n"
"}\n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_MAT3x3_H\n"
"#define B3_MAT3x3_H\n"
"#ifndef B3_QUAT_H\n"
"#define B3_QUAT_H\n"
"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
" typedef float4 b3Quat;\n"
" #define b3QuatConstArg const b3Quat\n"
" \n"
" \n"
"inline float4 b3FastNormalize4(float4 v)\n"
"{\n"
" v = (float4)(v.xyz,0.f);\n"
" return fast_normalize(v);\n"
"}\n"
" \n"
"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
"{\n"
" b3Quat ans;\n"
" ans = b3Cross3( a, b );\n"
" ans += a.w*b+b.w*a;\n"
"// ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
" ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
" return ans;\n"
"}\n"
"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
"{\n"
" b3Quat q;\n"
" q=in;\n"
" //return b3FastNormalize4(in);\n"
" float len = native_sqrt(dot(q, q));\n"
" if(len > 0.f)\n"
" {\n"
" q *= 1.f / len;\n"
" }\n"
" else\n"
" {\n"
" q.x = q.y = q.z = 0.f;\n"
" q.w = 1.f;\n"
" }\n"
" return q;\n"
"}\n"
"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" b3Quat qInv = b3QuatInvert( q );\n"
" float4 vcpy = vec;\n"
" vcpy.w = 0.f;\n"
" float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
" return out;\n"
"}\n"
"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
"{\n"
" return (b3Quat)(-q.xyz, q.w);\n"
"}\n"
"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
"{\n"
" return b3QuatRotate( b3QuatInvert( q ), vec );\n"
"}\n"
"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg orientation)\n"
"{\n"
" return b3QuatRotate( orientation, point ) + (translation);\n"
"}\n"
" \n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"typedef struct\n"
"{\n"
" b3Float4 m_row[3];\n"
"}b3Mat3x3;\n"
"#define b3Mat3x3ConstArg const b3Mat3x3\n"
"#define b3GetRow(m,row) (m.m_row[row])\n"
"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
"{\n"
" b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
" b3Mat3x3 out;\n"
" out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
" out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
" out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
" out.m_row[0].w = 0.f;\n"
" out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
" out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
" out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
" out.m_row[1].w = 0.f;\n"
" out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
" out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
" out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
" out.m_row[2].w = 0.f;\n"
" return out;\n"
"}\n"
"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = fabs(matIn.m_row[0]);\n"
" out.m_row[1] = fabs(matIn.m_row[1]);\n"
" out.m_row[2] = fabs(matIn.m_row[2]);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtZero();\n"
"__inline\n"
"b3Mat3x3 mtIdentity();\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
"__inline\n"
"b3Mat3x3 mtZero()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(0.f);\n"
" m.m_row[1] = (b3Float4)(0.f);\n"
" m.m_row[2] = (b3Float4)(0.f);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtIdentity()\n"
"{\n"
" b3Mat3x3 m;\n"
" m.m_row[0] = (b3Float4)(1,0,0,0);\n"
" m.m_row[1] = (b3Float4)(0,1,0,0);\n"
" m.m_row[2] = (b3Float4)(0,0,1,0);\n"
" return m;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
"{\n"
" b3Mat3x3 out;\n"
" out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
" out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
" out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
" return out;\n"
"}\n"
"__inline\n"
"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
"{\n"
" b3Mat3x3 transB;\n"
" transB = mtTranspose( b );\n"
" b3Mat3x3 ans;\n"
" // why this doesn't run when 0ing in the for{}\n"
" a.m_row[0].w = 0.f;\n"
" a.m_row[1].w = 0.f;\n"
" a.m_row[2].w = 0.f;\n"
" for(int i=0; i<3; i++)\n"
" {\n"
"// a.m_row[i].w = 0.f;\n"
" ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
" ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
" ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
" ans.m_row[i].w = 0.f;\n"
" }\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
"{\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a.m_row[0], b );\n"
" ans.y = b3Dot3F4( a.m_row[1], b );\n"
" ans.z = b3Dot3F4( a.m_row[2], b );\n"
" ans.w = 0.f;\n"
" return ans;\n"
"}\n"
"__inline\n"
"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
"{\n"
" b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
" b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
" b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
" b3Float4 ans;\n"
" ans.x = b3Dot3F4( a, colx );\n"
" ans.y = b3Dot3F4( a, coly );\n"
" ans.z = b3Dot3F4( a, colz );\n"
" return ans;\n"
"}\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3Aabb b3Aabb_t;\n"
"struct b3Aabb\n"
"{\n"
" union\n"
" {\n"
" float m_min[4];\n"
" b3Float4 m_minVec;\n"
" int m_minIndices[4];\n"
" };\n"
" union\n"
" {\n"
" float m_max[4];\n"
" b3Float4 m_maxVec;\n"
" int m_signedMaxIndices[4];\n"
" };\n"
"};\n"
"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
" b3Float4ConstArg pos,\n"
" b3QuatConstArg orn,\n"
" b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
"{\n"
" b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
" localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
" b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
" b3Mat3x3 m;\n"
" m = b3QuatGetRotationMatrix(orn);\n"
" b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
" b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
" \n"
" b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
" b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
" 0.f);\n"
" *aabbMinOut = center-extent;\n"
" *aabbMaxOut = center+extent;\n"
"}\n"
"/// conservative test for overlap between two aabbs\n"
"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
" b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
"{\n"
" bool overlap = true;\n"
" overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
" overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
" overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
" return overlap;\n"
"}\n"
"#endif //B3_AABB_H\n"
"#ifndef B3_COLLIDABLE_H\n"
"#define B3_COLLIDABLE_H\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"enum b3ShapeTypes\n"
"{\n"
" SHAPE_HEIGHT_FIELD=1,\n"
" SHAPE_CONVEX_HULL=3,\n"
" SHAPE_PLANE=4,\n"
" SHAPE_CONCAVE_TRIMESH=5,\n"
" SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n"
" SHAPE_SPHERE=7,\n"
" MAX_NUM_SHAPE_TYPES,\n"
"};\n"
"typedef struct b3Collidable b3Collidable_t;\n"
"struct b3Collidable\n"
"{\n"
" union {\n"
" int m_numChildShapes;\n"
" int m_bvhIndex;\n"
" };\n"
" union\n"
" {\n"
" float m_radius;\n"
" int m_compoundBvhIndex;\n"
" };\n"
" int m_shapeType;\n"
" union\n"
" {\n"
" int m_shapeIndex;\n"
" float m_height;\n"
" };\n"
"};\n"
"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
"struct b3GpuChildShape\n"
"{\n"
" b3Float4 m_childPosition;\n"
" b3Quat m_childOrientation;\n"
" union\n"
" {\n"
" int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
" int m_capsuleAxis;\n"
" };\n"
" union \n"
" {\n"
" float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n"
" int m_numChildShapes;//used for compound shape\n"
" };\n"
" union \n"
" {\n"
" float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n"
" int m_collidableShapeIndex;\n"
" };\n"
" int m_shapeType;\n"
"};\n"
"struct b3CompoundOverlappingPair\n"
"{\n"
" int m_bodyIndexA;\n"
" int m_bodyIndexB;\n"
"// int m_pairType;\n"
" int m_childShapeIndexA;\n"
" int m_childShapeIndexB;\n"
"};\n"
"#endif //B3_COLLIDABLE_H\n"
"#ifndef B3_RIGIDBODY_DATA_H\n"
"#define B3_RIGIDBODY_DATA_H\n"
"#ifndef B3_FLOAT4_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_FLOAT4_H\n"
"#ifndef B3_QUAT_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif \n"
"#endif //B3_QUAT_H\n"
"#ifndef B3_MAT3x3_H\n"
"#ifdef __cplusplus\n"
"#else\n"
"#endif\n"
"#endif //B3_MAT3x3_H\n"
"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
"struct b3RigidBodyData\n"
"{\n"
" b3Float4 m_pos;\n"
" b3Quat m_quat;\n"
" b3Float4 m_linVel;\n"
" b3Float4 m_angVel;\n"
" int m_collidableIdx;\n"
" float m_invMass;\n"
" float m_restituitionCoeff;\n"
" float m_frictionCoeff;\n"
"};\n"
"typedef struct b3InertiaData b3InertiaData_t;\n"
"struct b3InertiaData\n"
"{\n"
" b3Mat3x3 m_invInertiaWorld;\n"
" b3Mat3x3 m_initInvInertia;\n"
"};\n"
"#endif //B3_RIGIDBODY_DATA_H\n"
" \n"
"void b3ComputeWorldAabb( int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n"
"{\n"
" __global const b3RigidBodyData_t* body = &bodies[bodyId];\n"
" b3Float4 position = body->m_pos;\n"
" b3Quat orientation = body->m_quat;\n"
" \n"
" int collidableIndex = body->m_collidableIdx;\n"
" int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n"
" \n"
" if (shapeIndex>=0)\n"
" {\n"
" \n"
" b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n"
" b3Aabb_t worldAabb;\n"
" \n"
" b3Float4 aabbAMinOut,aabbAMaxOut; \n"
" float margin = 0.f;\n"
" b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n"
" \n"
" worldAabb.m_minVec =aabbAMinOut;\n"
" worldAabb.m_minIndices[3] = bodyId;\n"
" worldAabb.m_maxVec = aabbAMaxOut;\n"
" worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n"
" worldAabbs[bodyId] = worldAabb;\n"
" }\n"
"}\n"
"#endif //B3_UPDATE_AABBS_H\n"
"__kernel void initializeGpuAabbsFull( const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n"
"{\n"
" int nodeID = get_global_id(0);\n"
" if( nodeID < numNodes )\n"
" {\n"
" b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n"
" }\n"
"}\n"
"__kernel void clearOverlappingPairsKernel( __global int4* pairs, int numPairs)\n"
"{\n"
" int pairId = get_global_id(0);\n"
" if( pairId< numPairs )\n"
" {\n"
" pairs[pairId].z = 0xffffffff;\n"
" }\n"
"}\n"
;
// Embedded OpenCL program source for updating rigid-body world-space AABBs.
// NOTE(review): this appears to be machine-stringified kernel code (presumably
// generated from a .cl file by a stringify tool -- confirm before editing).
// Do NOT hand-edit or reformat the string contents; regenerate from the
// original kernel source instead.
//
// The program defines two kernels:
//   - initializeGpuAabbsFull: for each body, recomputes the world-space AABB
//     from its local shape AABB, position and orientation via
//     b3ComputeWorldAabb (which transforms the local AABB with
//     b3TransformAabb2 and stores bodyId / static-vs-dynamic flags in the
//     min/max index slots).
//   - clearOverlappingPairsKernel: resets pairs[i].z to 0xffffffff for each
//     pair, marking it as cleared.
static const char* updateAabbsKernelCL =
	"#ifndef B3_UPDATE_AABBS_H\n"
	"#define B3_UPDATE_AABBS_H\n"
	"#ifndef B3_AABB_H\n"
	"#define B3_AABB_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#define B3_FLOAT4_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#define B3_PLATFORM_DEFINITIONS_H\n"
	"struct MyTest\n"
	"{\n"
	"	int bla;\n"
	"};\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
	"#define B3_LARGE_FLOAT 1e18f\n"
	"#define B3_INFINITY 1e18f\n"
	"#define b3Assert(a)\n"
	"#define b3ConstArray(a) __global const a*\n"
	"#define b3AtomicInc atomic_inc\n"
	"#define b3AtomicAdd atomic_add\n"
	"#define b3Fabs fabs\n"
	"#define b3Sqrt native_sqrt\n"
	"#define b3Sin native_sin\n"
	"#define b3Cos native_cos\n"
	"#define B3_STATIC\n"
	"#endif\n"
	"#endif\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4	b3Float4;\n"
	"	#define b3Float4ConstArg const b3Float4\n"
	"	#define b3MakeFloat4 (float4)\n"
	"	float b3Dot3F4(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return dot(a1, b1);\n"
	"	}\n"
	"	b3Float4 b3Cross3(b3Float4ConstArg v0,b3Float4ConstArg v1)\n"
	"	{\n"
	"		float4 a1 = b3MakeFloat4(v0.xyz,0.f);\n"
	"		float4 b1 = b3MakeFloat4(v1.xyz,0.f);\n"
	"		return cross(a1, b1);\n"
	"	}\n"
	"	#define b3MinFloat4 min\n"
	"	#define b3MaxFloat4 max\n"
	"	#define b3Normalized(a) normalize(a)\n"
	"#endif \n"
	"		\n"
	"inline bool b3IsAlmostZero(b3Float4ConstArg v)\n"
	"{\n"
	"	if(b3Fabs(v.x)>1e-6 || b3Fabs(v.y)>1e-6 || b3Fabs(v.z)>1e-6)	\n"
	"		return false;\n"
	"	return true;\n"
	"}\n"
	"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
	"{\n"
	"    float maxDot = -B3_INFINITY;\n"
	"    int i = 0;\n"
	"    int ptIndex = -1;\n"
	"    for( i = 0; i < vecLen; i++ )\n"
	"    {\n"
	"        float dot = b3Dot3F4(vecArray[i],vec);\n"
	"            \n"
	"        if( dot > maxDot )\n"
	"        {\n"
	"            maxDot = dot;\n"
	"            ptIndex = i;\n"
	"        }\n"
	"    }\n"
	"	b3Assert(ptIndex>=0);\n"
	"    if (ptIndex<0)\n"
	"	{\n"
	"		ptIndex = 0;\n"
	"	}\n"
	"    *dotOut = maxDot;\n"
	"    return ptIndex;\n"
	"}\n"
	"#endif //B3_FLOAT4_H\n"
	"#ifndef B3_MAT3x3_H\n"
	"#define B3_MAT3x3_H\n"
	"#ifndef B3_QUAT_H\n"
	"#define B3_QUAT_H\n"
	"#ifndef B3_PLATFORM_DEFINITIONS_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif\n"
	"#endif\n"
	"#ifndef B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"	typedef float4	b3Quat;\n"
	"	#define b3QuatConstArg const b3Quat\n"
	"	\n"
	"	\n"
	"inline float4 b3FastNormalize4(float4 v)\n"
	"{\n"
	"	v = (float4)(v.xyz,0.f);\n"
	"	return fast_normalize(v);\n"
	"}\n"
	"	\n"
	"inline b3Quat b3QuatMul(b3Quat a, b3Quat b);\n"
	"inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
	"inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
	"inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
	"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
	"inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
	"{\n"
	"	b3Quat ans;\n"
	"	ans = b3Cross3( a, b );\n"
	"	ans += a.w*b+b.w*a;\n"
	"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
	"	ans.w = a.w*b.w - b3Dot3F4(a, b);\n"
	"	return ans;\n"
	"}\n"
	"inline b3Quat b3QuatNormalized(b3QuatConstArg in)\n"
	"{\n"
	"	b3Quat q;\n"
	"	q=in;\n"
	"	//return b3FastNormalize4(in);\n"
	"	float len = native_sqrt(dot(q, q));\n"
	"	if(len > 0.f)\n"
	"	{\n"
	"		q *= 1.f / len;\n"
	"	}\n"
	"	else\n"
	"	{\n"
	"		q.x = q.y = q.z = 0.f;\n"
	"		q.w = 1.f;\n"
	"	}\n"
	"	return q;\n"
	"}\n"
	"inline float4 b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
	"{\n"
	"	b3Quat qInv = b3QuatInvert( q );\n"
	"	float4 vcpy = vec;\n"
	"	vcpy.w = 0.f;\n"
	"	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
	"	return out;\n"
	"}\n"
	"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
	"{\n"
	"	return (b3Quat)(-q.xyz, q.w);\n"
	"}\n"
	"inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
	"{\n"
	"	return (b3Quat)(-q.xyz, q.w);\n"
	"}\n"
	"inline float4 b3QuatInvRotate(b3QuatConstArg q, b3QuatConstArg vec)\n"
	"{\n"
	"	return b3QuatRotate( b3QuatInvert( q ), vec );\n"
	"}\n"
	"inline b3Float4 b3TransformPoint(b3Float4ConstArg point, b3Float4ConstArg translation, b3QuatConstArg  orientation)\n"
	"{\n"
	"	return b3QuatRotate( orientation, point ) + (translation);\n"
	"}\n"
	"	\n"
	"#endif \n"
	"#endif //B3_QUAT_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"typedef struct\n"
	"{\n"
	"	b3Float4 m_row[3];\n"
	"}b3Mat3x3;\n"
	"#define b3Mat3x3ConstArg const b3Mat3x3\n"
	"#define b3GetRow(m,row) (m.m_row[row])\n"
	"inline b3Mat3x3 b3QuatGetRotationMatrix(b3Quat quat)\n"
	"{\n"
	"	b3Float4 quat2 = (b3Float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);\n"
	"	b3Mat3x3 out;\n"
	"	out.m_row[0].x=1-2*quat2.y-2*quat2.z;\n"
	"	out.m_row[0].y=2*quat.x*quat.y-2*quat.w*quat.z;\n"
	"	out.m_row[0].z=2*quat.x*quat.z+2*quat.w*quat.y;\n"
	"	out.m_row[0].w = 0.f;\n"
	"	out.m_row[1].x=2*quat.x*quat.y+2*quat.w*quat.z;\n"
	"	out.m_row[1].y=1-2*quat2.x-2*quat2.z;\n"
	"	out.m_row[1].z=2*quat.y*quat.z-2*quat.w*quat.x;\n"
	"	out.m_row[1].w = 0.f;\n"
	"	out.m_row[2].x=2*quat.x*quat.z-2*quat.w*quat.y;\n"
	"	out.m_row[2].y=2*quat.y*quat.z+2*quat.w*quat.x;\n"
	"	out.m_row[2].z=1-2*quat2.x-2*quat2.y;\n"
	"	out.m_row[2].w = 0.f;\n"
	"	return out;\n"
	"}\n"
	"inline b3Mat3x3 b3AbsoluteMat3x3(b3Mat3x3ConstArg matIn)\n"
	"{\n"
	"	b3Mat3x3 out;\n"
	"	out.m_row[0] = fabs(matIn.m_row[0]);\n"
	"	out.m_row[1] = fabs(matIn.m_row[1]);\n"
	"	out.m_row[2] = fabs(matIn.m_row[2]);\n"
	"	return out;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtZero();\n"
	"__inline\n"
	"b3Mat3x3 mtIdentity();\n"
	"__inline\n"
	"b3Mat3x3 mtTranspose(b3Mat3x3 m);\n"
	"__inline\n"
	"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b);\n"
	"__inline\n"
	"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b);\n"
	"__inline\n"
	"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b);\n"
	"__inline\n"
	"b3Mat3x3 mtZero()\n"
	"{\n"
	"	b3Mat3x3 m;\n"
	"	m.m_row[0] = (b3Float4)(0.f);\n"
	"	m.m_row[1] = (b3Float4)(0.f);\n"
	"	m.m_row[2] = (b3Float4)(0.f);\n"
	"	return m;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtIdentity()\n"
	"{\n"
	"	b3Mat3x3 m;\n"
	"	m.m_row[0] = (b3Float4)(1,0,0,0);\n"
	"	m.m_row[1] = (b3Float4)(0,1,0,0);\n"
	"	m.m_row[2] = (b3Float4)(0,0,1,0);\n"
	"	return m;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtTranspose(b3Mat3x3 m)\n"
	"{\n"
	"	b3Mat3x3 out;\n"
	"	out.m_row[0] = (b3Float4)(m.m_row[0].x, m.m_row[1].x, m.m_row[2].x, 0.f);\n"
	"	out.m_row[1] = (b3Float4)(m.m_row[0].y, m.m_row[1].y, m.m_row[2].y, 0.f);\n"
	"	out.m_row[2] = (b3Float4)(m.m_row[0].z, m.m_row[1].z, m.m_row[2].z, 0.f);\n"
	"	return out;\n"
	"}\n"
	"__inline\n"
	"b3Mat3x3 mtMul(b3Mat3x3 a, b3Mat3x3 b)\n"
	"{\n"
	"	b3Mat3x3 transB;\n"
	"	transB = mtTranspose( b );\n"
	"	b3Mat3x3 ans;\n"
	"	//	why this doesn't run when 0ing in the for{}\n"
	"	a.m_row[0].w = 0.f;\n"
	"	a.m_row[1].w = 0.f;\n"
	"	a.m_row[2].w = 0.f;\n"
	"	for(int i=0; i<3; i++)\n"
	"	{\n"
	"//	a.m_row[i].w = 0.f;\n"
	"		ans.m_row[i].x = b3Dot3F4(a.m_row[i],transB.m_row[0]);\n"
	"		ans.m_row[i].y = b3Dot3F4(a.m_row[i],transB.m_row[1]);\n"
	"		ans.m_row[i].z = b3Dot3F4(a.m_row[i],transB.m_row[2]);\n"
	"		ans.m_row[i].w = 0.f;\n"
	"	}\n"
	"	return ans;\n"
	"}\n"
	"__inline\n"
	"b3Float4 mtMul1(b3Mat3x3 a, b3Float4 b)\n"
	"{\n"
	"	b3Float4 ans;\n"
	"	ans.x = b3Dot3F4( a.m_row[0], b );\n"
	"	ans.y = b3Dot3F4( a.m_row[1], b );\n"
	"	ans.z = b3Dot3F4( a.m_row[2], b );\n"
	"	ans.w = 0.f;\n"
	"	return ans;\n"
	"}\n"
	"__inline\n"
	"b3Float4 mtMul3(b3Float4 a, b3Mat3x3 b)\n"
	"{\n"
	"	b3Float4 colx = b3MakeFloat4(b.m_row[0].x, b.m_row[1].x, b.m_row[2].x, 0);\n"
	"	b3Float4 coly = b3MakeFloat4(b.m_row[0].y, b.m_row[1].y, b.m_row[2].y, 0);\n"
	"	b3Float4 colz = b3MakeFloat4(b.m_row[0].z, b.m_row[1].z, b.m_row[2].z, 0);\n"
	"	b3Float4 ans;\n"
	"	ans.x = b3Dot3F4( a, colx );\n"
	"	ans.y = b3Dot3F4( a, coly );\n"
	"	ans.z = b3Dot3F4( a, colz );\n"
	"	return ans;\n"
	"}\n"
	"#endif\n"
	"#endif //B3_MAT3x3_H\n"
	"typedef struct b3Aabb b3Aabb_t;\n"
	"struct b3Aabb\n"
	"{\n"
	"	union\n"
	"	{\n"
	"		float m_min[4];\n"
	"		b3Float4 m_minVec;\n"
	"		int m_minIndices[4];\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float m_max[4];\n"
	"		b3Float4 m_maxVec;\n"
	"		int m_signedMaxIndices[4];\n"
	"	};\n"
	"};\n"
	"inline void b3TransformAabb2(b3Float4ConstArg localAabbMin,b3Float4ConstArg localAabbMax, float margin,\n"
	"						b3Float4ConstArg pos,\n"
	"						b3QuatConstArg orn,\n"
	"						b3Float4* aabbMinOut,b3Float4* aabbMaxOut)\n"
	"{\n"
	"		b3Float4 localHalfExtents = 0.5f*(localAabbMax-localAabbMin);\n"
	"		localHalfExtents+=b3MakeFloat4(margin,margin,margin,0.f);\n"
	"		b3Float4 localCenter = 0.5f*(localAabbMax+localAabbMin);\n"
	"		b3Mat3x3 m;\n"
	"		m = b3QuatGetRotationMatrix(orn);\n"
	"		b3Mat3x3 abs_b = b3AbsoluteMat3x3(m);\n"
	"		b3Float4 center = b3TransformPoint(localCenter,pos,orn);\n"
	"		\n"
	"		b3Float4 extent = b3MakeFloat4(b3Dot3F4(localHalfExtents,b3GetRow(abs_b,0)),\n"
	"		 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,1)),\n"
	"		 b3Dot3F4(localHalfExtents,b3GetRow(abs_b,2)),\n"
	"		 0.f);\n"
	"		*aabbMinOut = center-extent;\n"
	"		*aabbMaxOut = center+extent;\n"
	"}\n"
	"/// conservative test for overlap between two aabbs\n"
	"inline bool b3TestAabbAgainstAabb(b3Float4ConstArg aabbMin1,b3Float4ConstArg aabbMax1,\n"
	"								b3Float4ConstArg aabbMin2, b3Float4ConstArg aabbMax2)\n"
	"{\n"
	"	bool overlap = true;\n"
	"	overlap = (aabbMin1.x > aabbMax2.x || aabbMax1.x < aabbMin2.x) ? false : overlap;\n"
	"	overlap = (aabbMin1.z > aabbMax2.z || aabbMax1.z < aabbMin2.z) ? false : overlap;\n"
	"	overlap = (aabbMin1.y > aabbMax2.y || aabbMax1.y < aabbMin2.y) ? false : overlap;\n"
	"	return overlap;\n"
	"}\n"
	"#endif //B3_AABB_H\n"
	"#ifndef B3_COLLIDABLE_H\n"
	"#define B3_COLLIDABLE_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_FLOAT4_H\n"
	"#ifndef B3_QUAT_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_QUAT_H\n"
	"enum b3ShapeTypes\n"
	"{\n"
	"	SHAPE_HEIGHT_FIELD=1,\n"
	"	SHAPE_CONVEX_HULL=3,\n"
	"	SHAPE_PLANE=4,\n"
	"	SHAPE_CONCAVE_TRIMESH=5,\n"
	"	SHAPE_COMPOUND_OF_CONVEX_HULLS=6,\n"
	"	SHAPE_SPHERE=7,\n"
	"	MAX_NUM_SHAPE_TYPES,\n"
	"};\n"
	"typedef struct b3Collidable b3Collidable_t;\n"
	"struct b3Collidable\n"
	"{\n"
	"	union {\n"
	"		int m_numChildShapes;\n"
	"		int m_bvhIndex;\n"
	"	};\n"
	"	union\n"
	"	{\n"
	"		float m_radius;\n"
	"		int	m_compoundBvhIndex;\n"
	"	};\n"
	"	int m_shapeType;\n"
	"	union\n"
	"	{\n"
	"		int m_shapeIndex;\n"
	"		float m_height;\n"
	"	};\n"
	"};\n"
	"typedef struct b3GpuChildShape b3GpuChildShape_t;\n"
	"struct b3GpuChildShape\n"
	"{\n"
	"	b3Float4	m_childPosition;\n"
	"	b3Quat	m_childOrientation;\n"
	"	union\n"
	"	{\n"
	"		int m_shapeIndex;//used for SHAPE_COMPOUND_OF_CONVEX_HULLS\n"
	"		int m_capsuleAxis;\n"
	"	};\n"
	"	union \n"
	"	{\n"
	"		float m_radius;//used for childshape of SHAPE_COMPOUND_OF_SPHERES or SHAPE_COMPOUND_OF_CAPSULES\n"
	"		int m_numChildShapes;//used for compound shape\n"
	"	};\n"
	"	union \n"
	"	{\n"
	"		float m_height;//used for childshape of SHAPE_COMPOUND_OF_CAPSULES\n"
	"		int m_collidableShapeIndex;\n"
	"	};\n"
	"	int m_shapeType;\n"
	"};\n"
	"struct b3CompoundOverlappingPair\n"
	"{\n"
	"	int m_bodyIndexA;\n"
	"	int m_bodyIndexB;\n"
	"//	int m_pairType;\n"
	"	int m_childShapeIndexA;\n"
	"	int m_childShapeIndexB;\n"
	"};\n"
	"#endif //B3_COLLIDABLE_H\n"
	"#ifndef B3_RIGIDBODY_DATA_H\n"
	"#define B3_RIGIDBODY_DATA_H\n"
	"#ifndef B3_FLOAT4_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_FLOAT4_H\n"
	"#ifndef B3_QUAT_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif \n"
	"#endif //B3_QUAT_H\n"
	"#ifndef B3_MAT3x3_H\n"
	"#ifdef __cplusplus\n"
	"#else\n"
	"#endif\n"
	"#endif //B3_MAT3x3_H\n"
	"typedef struct b3RigidBodyData b3RigidBodyData_t;\n"
	"struct b3RigidBodyData\n"
	"{\n"
	"	b3Float4				m_pos;\n"
	"	b3Quat					m_quat;\n"
	"	b3Float4				m_linVel;\n"
	"	b3Float4				m_angVel;\n"
	"	int 					m_collidableIdx;\n"
	"	float 				m_invMass;\n"
	"	float 				m_restituitionCoeff;\n"
	"	float 				m_frictionCoeff;\n"
	"};\n"
	"typedef struct b3InertiaData b3InertiaData_t;\n"
	"struct b3InertiaData\n"
	"{\n"
	"	b3Mat3x3 m_invInertiaWorld;\n"
	"	b3Mat3x3 m_initInvInertia;\n"
	"};\n"
	"#endif //B3_RIGIDBODY_DATA_H\n"
	"	\n"
	"void b3ComputeWorldAabb(	int bodyId, __global const b3RigidBodyData_t* bodies, __global const b3Collidable_t* collidables, __global const b3Aabb_t* localShapeAABB, __global b3Aabb_t* worldAabbs)\n"
	"{\n"
	"	__global const b3RigidBodyData_t* body = &bodies[bodyId];\n"
	"	b3Float4 position = body->m_pos;\n"
	"	b3Quat orientation = body->m_quat;\n"
	"	\n"
	"	int collidableIndex = body->m_collidableIdx;\n"
	"	int shapeIndex = collidables[collidableIndex].m_shapeIndex;\n"
	"	\n"
	"	if (shapeIndex>=0)\n"
	"	{\n"
	"		\n"
	"		b3Aabb_t localAabb = localShapeAABB[collidableIndex];\n"
	"		b3Aabb_t worldAabb;\n"
	"		\n"
	"		b3Float4 aabbAMinOut,aabbAMaxOut; \n"
	"		float margin = 0.f;\n"
	"		b3TransformAabb2(localAabb.m_minVec,localAabb.m_maxVec,margin,position,orientation,&aabbAMinOut,&aabbAMaxOut);\n"
	"		\n"
	"		worldAabb.m_minVec =aabbAMinOut;\n"
	"		worldAabb.m_minIndices[3] = bodyId;\n"
	"		worldAabb.m_maxVec = aabbAMaxOut;\n"
	"		worldAabb.m_signedMaxIndices[3] = body[bodyId].m_invMass==0.f? 0 : 1;\n"
	"		worldAabbs[bodyId] = worldAabb;\n"
	"	}\n"
	"}\n"
	"#endif //B3_UPDATE_AABBS_H\n"
	"__kernel void initializeGpuAabbsFull(  const int numNodes, __global b3RigidBodyData_t* gBodies,__global b3Collidable_t* collidables, __global b3Aabb_t* plocalShapeAABB, __global b3Aabb_t* pAABB)\n"
	"{\n"
	"	int nodeID = get_global_id(0);\n"
	"	if( nodeID < numNodes )\n"
	"	{\n"
	"		b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n"
	"	}\n"
	"}\n"
	"__kernel void clearOverlappingPairsKernel(  __global int4* pairs, int numPairs)\n"
	"{\n"
	"	int pairId = get_global_id(0);\n"
	"	if( pairId< numPairs )\n"
	"	{\n"
	"		pairs[pairId].z = 0xffffffff;\n"
	"	}\n"
	"}\n";