Code-style consistency improvement:

Apply clang-format-all.sh, using the _clang-format file, to all the .cpp/.h files. Make sure not to apply it to certain serialization structures, since some parsers expect the * as part of the name instead of the type. This commit contains no other changes aside from adding and applying clang-format-all.sh.
2018-09-23 14:17:31 -07:00
parent b73b05e9fb
commit ab8f16961e
1773 changed files with 1081087 additions and 474249 deletions
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.cpp
@@ -19,149 +19,139 @@ subject to the following restrictions:
 #define KERNEL1 "SearchSortDataUpperKernel"
 #define KERNEL2 "SubtractKernel"

-
 #include "b3BoundSearchCL.h"
 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "b3LauncherCL.h"
 #include "kernels/BoundSearchKernelsCL.h"

 b3BoundSearchCL::b3BoundSearchCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int maxSize)
-	:m_context(ctx),
-	m_device(device),
-	m_queue(queue)
+	: m_context(ctx),
+	  m_device(device),
+	  m_queue(queue)
 {
-
 	const char* additionalMacros = "";
 	//const char* srcFileNameForCaching="";

 	cl_int pErrNum;
 	const char* kernelSource = boundSearchKernelsCL;

-	cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, BOUNDSEARCH_PATH);
+	cl_program boundSearchProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, BOUNDSEARCH_PATH);
 	b3Assert(boundSearchProg);

-	m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg,additionalMacros );
-	b3Assert(m_lowerSortDataKernel );
+	m_lowerSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataLowerKernel", &pErrNum, boundSearchProg, additionalMacros);
+	b3Assert(m_lowerSortDataKernel);

-	m_upperSortDataKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg,additionalMacros );
+	m_upperSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SearchSortDataUpperKernel", &pErrNum, boundSearchProg, additionalMacros);
 	b3Assert(m_upperSortDataKernel);

 	m_subtractKernel = 0;

-	if( maxSize )
+	if (maxSize)
 	{
-		m_subtractKernel= b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg,additionalMacros );
+		m_subtractKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "SubtractKernel", &pErrNum, boundSearchProg, additionalMacros);
 		b3Assert(m_subtractKernel);
 	}

 	//m_constBuffer = new b3OpenCLArray<b3Int4>( device, 1, BufferBase::BUFFER_CONST );
-	
-	m_lower = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue,maxSize );
-	m_upper = (maxSize == 0)? 0: new b3OpenCLArray<unsigned int>(ctx,queue, maxSize );

-	m_filler = new b3FillCL(ctx,device,queue);
+	m_lower = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
+	m_upper = (maxSize == 0) ? 0 : new b3OpenCLArray<unsigned int>(ctx, queue, maxSize);
+
+	m_filler = new b3FillCL(ctx, device, queue);
 }

 b3BoundSearchCL::~b3BoundSearchCL()
 {
-	
 	delete m_lower;
 	delete m_upper;
 	delete m_filler;
-			
+
 	clReleaseKernel(m_lowerSortDataKernel);
 	clReleaseKernel(m_upperSortDataKernel);
 	clReleaseKernel(m_subtractKernel);
-	
-
 }

-
-void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option )
+void b3BoundSearchCL::execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option)
 {
 	b3Int4 constBuffer;
 	constBuffer.x = nSrc;
 	constBuffer.y = nDst;

-	if( option == BOUND_LOWER )
+	if (option == BOUND_LOWER)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL()) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};

-		b3LauncherCL launcher( m_queue, m_lowerSortDataKernel,"m_lowerSortDataKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( nSrc );
-        launcher.setConst( nDst );
-        
-		launcher.launch1D( nSrc, 64 );
+		b3LauncherCL launcher(m_queue, m_lowerSortDataKernel, "m_lowerSortDataKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(nSrc);
+		launcher.setConst(nDst);
+
+		launcher.launch1D(nSrc, 64);
 	}
-	else if( option == BOUND_UPPER )
+	else if (option == BOUND_UPPER)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};

-		b3LauncherCL launcher(m_queue, m_upperSortDataKernel,"m_upperSortDataKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-        launcher.setConst( nSrc );
-        launcher.setConst( nDst );
+		b3LauncherCL launcher(m_queue, m_upperSortDataKernel, "m_upperSortDataKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(nSrc);
+		launcher.setConst(nDst);

-		launcher.launch1D( nSrc, 64 );
+		launcher.launch1D(nSrc, 64);
 	}
-	else if( option == COUNT )
+	else if (option == COUNT)
 	{
-		b3Assert( m_lower );
-		b3Assert( m_upper );
-		b3Assert( m_lower->capacity() <= (int)nDst );
-		b3Assert( m_upper->capacity() <= (int)nDst );
+		b3Assert(m_lower);
+		b3Assert(m_upper);
+		b3Assert(m_lower->capacity() <= (int)nDst);
+		b3Assert(m_upper->capacity() <= (int)nDst);

 		int zero = 0;
-		m_filler->execute( *m_lower, zero, nDst );
-		m_filler->execute( *m_upper, zero, nDst );
+		m_filler->execute(*m_lower, zero, nDst);
+		m_filler->execute(*m_upper, zero, nDst);

-		execute( src, nSrc, *m_lower, nDst, BOUND_LOWER );
-		execute( src, nSrc, *m_upper, nDst, BOUND_UPPER );
+		execute(src, nSrc, *m_lower, nDst, BOUND_LOWER);
+		execute(src, nSrc, *m_upper, nDst, BOUND_UPPER);

 		{
-			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_upper->getBufferCL(), true ), b3BufferInfoCL( m_lower->getBufferCL(), true ), b3BufferInfoCL( dst.getBufferCL() ) };
+			b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_upper->getBufferCL(), true), b3BufferInfoCL(m_lower->getBufferCL(), true), b3BufferInfoCL(dst.getBufferCL())};

-			b3LauncherCL  launcher( m_queue, m_subtractKernel ,"m_subtractKernel");
-			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-            launcher.setConst( nSrc );
-            launcher.setConst( nDst );
+			b3LauncherCL launcher(m_queue, m_subtractKernel, "m_subtractKernel");
+			launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+			launcher.setConst(nSrc);
+			launcher.setConst(nDst);

-			launcher.launch1D( nDst, 64 );
+			launcher.launch1D(nDst, 64);
 		}
 	}
 	else
 	{
-		b3Assert( 0 );
+		b3Assert(0);
 	}
-
 }

-
-void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, 
-	b3AlignedObjectArray<unsigned int>& dst,  int nDst, Option option )
+void b3BoundSearchCL::executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc,
+								  b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option)
 {
+	for (int i = 0; i < nSrc - 1; i++)
+		b3Assert(src[i].m_key <= src[i + 1].m_key);

-
-	for(int i=0; i<nSrc-1; i++) 
-		b3Assert( src[i].m_key <= src[i+1].m_key );
-
-	b3SortData minData,zeroData,maxData;
+	b3SortData minData, zeroData, maxData;
 	minData.m_key = -1;
 	minData.m_value = -1;
-	zeroData.m_key=0;
-	zeroData.m_value=0;
+	zeroData.m_key = 0;
+	zeroData.m_value = 0;
 	maxData.m_key = nDst;
 	maxData.m_value = nDst;

-	if( option == BOUND_LOWER )
+	if (option == BOUND_LOWER)
 	{
-		for(int i=0; i<nSrc; i++)
+		for (int i = 0; i < nSrc; i++)
 		{
-			b3SortData& iData = (i==0)? minData: src[i-1];
-			b3SortData& jData = (i==nSrc)? maxData: src[i];
+			b3SortData& iData = (i == 0) ? minData : src[i - 1];
+			b3SortData& jData = (i == nSrc) ? maxData : src[i];

-			if( iData.m_key != jData.m_key )
+			if (iData.m_key != jData.m_key)
 			{
 				int k = jData.m_key;
 				{
@@ -170,14 +160,14 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
 			}
 		}
 	}
-	else if( option == BOUND_UPPER )
+	else if (option == BOUND_UPPER)
 	{
-		for(int i=1; i<nSrc+1; i++)
+		for (int i = 1; i < nSrc + 1; i++)
 		{
-			b3SortData& iData = src[i-1];
-			b3SortData& jData = (i==nSrc)? maxData: src[i];
+			b3SortData& iData = src[i - 1];
+			b3SortData& jData = (i == nSrc) ? maxData : src[i];

-			if( iData.m_key != jData.m_key )
+			if (iData.m_key != jData.m_key)
 			{
 				int k = iData.m_key;
 				{
@@ -186,28 +176,28 @@ void b3BoundSearchCL::executeHost( b3AlignedObjectArray<b3SortData>& src, int nS
 			}
 		}
 	}
-	else if( option == COUNT )
+	else if (option == COUNT)
 	{
 		b3AlignedObjectArray<unsigned int> lower;
-		lower.resize(nDst );
+		lower.resize(nDst);
 		b3AlignedObjectArray<unsigned int> upper;
-		upper.resize(nDst );
+		upper.resize(nDst);

-		for(int i=0; i<nDst; i++) 
-		{ 
-			lower[i] = upper[i] = 0; 
+		for (int i = 0; i < nDst; i++)
+		{
+			lower[i] = upper[i] = 0;
 		}

-		executeHost( src, nSrc, lower, nDst, BOUND_LOWER );
-		executeHost( src, nSrc, upper, nDst, BOUND_UPPER );
+		executeHost(src, nSrc, lower, nDst, BOUND_LOWER);
+		executeHost(src, nSrc, upper, nDst, BOUND_UPPER);

-		for( int i=0; i<nDst; i++) 
-		{ 
-			dst[i] = upper[i] - lower[i]; 
+		for (int i = 0; i < nDst; i++)
+		{
+			dst[i] = upper[i] - lower[i];
 		}
 	}
 	else
 	{
-		b3Assert( 0 );
+		b3Assert(0);
 	}
 }
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3BoundSearchCL.h
@@ -26,42 +26,39 @@ subject to the following restrictions:

 #include "b3OpenCLArray.h"
 #include "b3FillCL.h"
-#include "b3RadixSort32CL.h" //for b3SortData (perhaps move it?)
+#include "b3RadixSort32CL.h"  //for b3SortData (perhaps move it?)
 class b3BoundSearchCL
 {
-	public:
+public:
+	enum Option
+	{
+		BOUND_LOWER,
+		BOUND_UPPER,
+		COUNT,
+	};

-		enum Option
-		{
-			BOUND_LOWER,
-			BOUND_UPPER,
-			COUNT,
-		};
+	cl_context m_context;
+	cl_device_id m_device;
+	cl_command_queue m_queue;

-		cl_context m_context;
-		cl_device_id m_device;
-		cl_command_queue m_queue;
+	cl_kernel m_lowerSortDataKernel;
+	cl_kernel m_upperSortDataKernel;
+	cl_kernel m_subtractKernel;

-		
-		cl_kernel m_lowerSortDataKernel;
-		cl_kernel m_upperSortDataKernel;
-		cl_kernel m_subtractKernel;
-		
-		b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
-		b3OpenCLArray<unsigned int>* m_lower;
-		b3OpenCLArray<unsigned int>* m_upper;
-		
-		b3FillCL* m_filler;
-		
-		b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);
+	b3OpenCLArray<b3Int4>* m_constbtOpenCLArray;
+	b3OpenCLArray<unsigned int>* m_lower;
+	b3OpenCLArray<unsigned int>* m_upper;

-		virtual ~b3BoundSearchCL();
+	b3FillCL* m_filler;

-		//	src has to be src[i].m_key <= src[i+1].m_key
-		void execute( b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER );
+	b3BoundSearchCL(cl_context context, cl_device_id device, cl_command_queue queue, int size);

-		void executeHost( b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
+	virtual ~b3BoundSearchCL();
+
+	//	src has to be src[i].m_key <= src[i+1].m_key
+	void execute(b3OpenCLArray<b3SortData>& src, int nSrc, b3OpenCLArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
+
+	void executeHost(b3AlignedObjectArray<b3SortData>& src, int nSrc, b3AlignedObjectArray<unsigned int>& dst, int nDst, Option option = BOUND_LOWER);
 };

-
-#endif //B3_BOUNDSEARCH_H
+#endif  //B3_BOUNDSEARCH_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3BufferInfoCL.h
@@ -4,16 +4,15 @@

 #include "b3OpenCLArray.h"

-
 struct b3BufferInfoCL
 {
 	//b3BufferInfoCL(){}

-//	template<typename T>
-	b3BufferInfoCL(cl_mem buff, bool isReadOnly = false): m_clBuffer(buff), m_isReadOnly(isReadOnly){}
+	//	template<typename T>
+	b3BufferInfoCL(cl_mem buff, bool isReadOnly = false) : m_clBuffer(buff), m_isReadOnly(isReadOnly) {}

 	cl_mem m_clBuffer;
 	bool m_isReadOnly;
 };

-#endif //B3_BUFFER_INFO_CL_H
+#endif  //B3_BUFFER_INFO_CL_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.cpp
@@ -8,29 +8,26 @@
 #include "kernels/FillKernelsCL.h"

 b3FillCL::b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	const char* kernelSource = fillKernelsCL;
 	cl_int pErrNum;
 	const char* additionalMacros = "";

-	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, kernelSource, &pErrNum,additionalMacros, FILL_CL_PROGRAM_PATH);
+	cl_program fillProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, kernelSource, &pErrNum, additionalMacros, FILL_CL_PROGRAM_PATH);
 	b3Assert(fillProg);

-	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg,additionalMacros );
+	m_fillIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillIntKernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillIntKernel);

-	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg,additionalMacros );
+	m_fillUnsignedIntKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillUnsignedIntKernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillIntKernel);

-	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg,additionalMacros );
+	m_fillFloatKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillFloatKernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillFloatKernel);

-	
-
-	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString( ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg,additionalMacros );
+	m_fillKernelInt2 = b3OpenCLUtils::compileCLKernelFromString(ctx, device, kernelSource, "FillInt2Kernel", &pErrNum, fillProg, additionalMacros);
 	b3Assert(m_fillKernelInt2);
-	
 }

 b3FillCL::~b3FillCL()
@@ -39,88 +36,84 @@ b3FillCL::~b3FillCL()
 	clReleaseKernel(m_fillIntKernel);
 	clReleaseKernel(m_fillUnsignedIntKernel);
 	clReleaseKernel(m_fillFloatKernel);
-
 }

 void b3FillCL::execute(b3OpenCLArray<float>& src, const float value, int n, int offset)
 {
-	b3Assert( n>0 );
+	b3Assert(n > 0);

 	{
-		b3LauncherCL launcher( m_commandQueue, m_fillFloatKernel,"m_fillFloatKernel" );
-		launcher.setBuffer( src.getBufferCL());
-		launcher.setConst( n );
-		launcher.setConst( value );
-		launcher.setConst( offset);
+		b3LauncherCL launcher(m_commandQueue, m_fillFloatKernel, "m_fillFloatKernel");
+		launcher.setBuffer(src.getBufferCL());
+		launcher.setConst(n);
+		launcher.setConst(value);
+		launcher.setConst(offset);

-		launcher.launch1D( n );
+		launcher.launch1D(n);
 	}
 }

 void b3FillCL::execute(b3OpenCLArray<int>& src, const int value, int n, int offset)
 {
-	b3Assert( n>0 );
-	
+	b3Assert(n > 0);

 	{
-		b3LauncherCL launcher( m_commandQueue, m_fillIntKernel ,"m_fillIntKernel");
+		b3LauncherCL launcher(m_commandQueue, m_fillIntKernel, "m_fillIntKernel");
 		launcher.setBuffer(src.getBufferCL());
-		launcher.setConst( n);
-		launcher.setConst( value);
-		launcher.setConst( offset);
-		launcher.launch1D( n );
+		launcher.setConst(n);
+		launcher.setConst(value);
+		launcher.setConst(offset);
+		launcher.launch1D(n);
 	}
 }

-
 void b3FillCL::execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset)
 {
-	b3Assert( n>0 );
+	b3Assert(n > 0);

 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};

-		b3LauncherCL launcher( m_commandQueue, m_fillUnsignedIntKernel,"m_fillUnsignedIntKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( n );
-        launcher.setConst(value);
+		b3LauncherCL launcher(m_commandQueue, m_fillUnsignedIntKernel, "m_fillUnsignedIntKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(n);
+		launcher.setConst(value);
 		launcher.setConst(offset);

-		launcher.launch1D( n );
+		launcher.launch1D(n);
 	}
 }

-void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
+void b3FillCL::executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset)
 {
-	for (int i=0;i<n;i++)
+	for (int i = 0; i < n; i++)
 	{
-		src[i+offset]=value;
+		src[i + offset] = value;
 	}
 }

-void b3FillCL::executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset)
+void b3FillCL::executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset)
 {
-	for (int i=0;i<n;i++)
+	for (int i = 0; i < n; i++)
 	{
-		src[i+offset]=value;
+		src[i + offset] = value;
 	}
 }

-void b3FillCL::execute(b3OpenCLArray<b3Int2> &src, const b3Int2 &value, int n, int offset)
+void b3FillCL::execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset)
 {
-	b3Assert( n>0 );
-	
+	b3Assert(n > 0);

 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( src.getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(src.getBufferCL())};

-		b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2,"m_fillKernelInt2");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+		b3LauncherCL launcher(m_commandQueue, m_fillKernelInt2, "m_fillKernelInt2");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
 		launcher.setConst(n);
 		launcher.setConst(value);
 		launcher.setConst(offset);

 		//( constBuffer );
-		launcher.launch1D( n );
+		launcher.launch1D(n);
 	}
 }
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3FillCL.h
@@ -7,57 +7,46 @@
 #include "Bullet3Common/shared/b3Int2.h"
 #include "Bullet3Common/shared/b3Int4.h"

-
 class b3FillCL
 {
-	
-	cl_command_queue	m_commandQueue;
-	
-	cl_kernel			m_fillKernelInt2;
-	cl_kernel			m_fillIntKernel;
-	cl_kernel			m_fillUnsignedIntKernel;
-	cl_kernel			m_fillFloatKernel;
+	cl_command_queue m_commandQueue;

-	public:
-		
-		struct b3ConstData
-		{
-			union
-			{
-				b3Int4 m_data;
-				b3UnsignedInt4 m_UnsignedData;
-			};
-			int m_offset;
-			int m_n;
-			int m_padding[2];
-		};
-
-protected:
+	cl_kernel m_fillKernelInt2;
+	cl_kernel m_fillIntKernel;
+	cl_kernel m_fillUnsignedIntKernel;
+	cl_kernel m_fillFloatKernel;

 public:
+	struct b3ConstData
+	{
+		union {
+			b3Int4 m_data;
+			b3UnsignedInt4 m_UnsignedData;
+		};
+		int m_offset;
+		int m_n;
+		int m_padding[2];
+	};

-		b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);
+protected:
+public:
+	b3FillCL(cl_context ctx, cl_device_id device, cl_command_queue queue);

-		virtual ~b3FillCL();
+	virtual ~b3FillCL();

-		void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);
-	
-		void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);
+	void execute(b3OpenCLArray<unsigned int>& src, const unsigned int value, int n, int offset = 0);

-		void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);
+	void execute(b3OpenCLArray<int>& src, const int value, int n, int offset = 0);

-		void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);
+	void execute(b3OpenCLArray<float>& src, const float value, int n, int offset = 0);

-		void executeHost(b3AlignedObjectArray<b3Int2> &src, const b3Int2 &value, int n, int offset);
+	void execute(b3OpenCLArray<b3Int2>& src, const b3Int2& value, int n, int offset = 0);

-		void executeHost(b3AlignedObjectArray<int> &src, const int value, int n, int offset);
+	void executeHost(b3AlignedObjectArray<b3Int2>& src, const b3Int2& value, int n, int offset);
+
+	void executeHost(b3AlignedObjectArray<int>& src, const int value, int n, int offset);

 	//	void execute(b3OpenCLArray<b3Int4>& src, const b3Int4& value, int n, int offset = 0);
-
 };
-		
-		
-		
-	

-#endif //B3_FILL_CL_H
+#endif  //B3_FILL_CL_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.cpp
@@ -1,13 +1,13 @@
 #include "b3LauncherCL.h"

 bool gDebugLauncherCL = false;
-    
+
 b3LauncherCL::b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name)
-:m_commandQueue(queue),
-m_kernel(kernel),
-m_idx(0),
-m_enableSerialization(false),
-m_name(name)
+	: m_commandQueue(queue),
+	  m_kernel(kernel),
+	  m_idx(0),
+	  m_enableSerialization(false),
+	  m_name(name)
 {
 	if (gDebugLauncherCL)
 	{
@@ -15,59 +15,58 @@ m_name(name)
 		printf("[%d] Prepare to launch OpenCL kernel %s\n", counter++, name);
 	}

-      m_serializationSizeInBytes = sizeof(int);
+	m_serializationSizeInBytes = sizeof(int);
 }
-    
+
 b3LauncherCL::~b3LauncherCL()
-  {
-      for (int i=0;i<m_arrays.size();i++)
-      {
-		  delete (m_arrays[i]);
-      }
-
-	  m_arrays.clear();
-	  if (gDebugLauncherCL)
-	  {
-		static int counter = 0;
-		printf("[%d] Finished launching OpenCL kernel %s\n", counter++,m_name);
-	  }
-  }
-
-void b3LauncherCL::setBuffer( cl_mem clBuffer)
 {
-		if (m_enableSerialization)
-		{
-			b3KernelArgData kernelArg;
-			kernelArg.m_argIndex = m_idx;
-			kernelArg.m_isBuffer = 1;
-			kernelArg.m_clBuffer = clBuffer;
-		
-			cl_mem_info param_name = CL_MEM_SIZE;
-			size_t param_value;
-			size_t sizeInBytes = sizeof(size_t);
-			size_t actualSizeInBytes;
-			cl_int err;
-			err = clGetMemObjectInfo (	kernelArg.m_clBuffer,
-									  param_name,
-									  sizeInBytes,
-									  &param_value,
-									  &actualSizeInBytes);
-			
-			b3Assert( err == CL_SUCCESS );
-			kernelArg.m_argSizeInBytes = param_value;
-			
-			m_kernelArguments.push_back(kernelArg);
-			m_serializationSizeInBytes+= sizeof(b3KernelArgData);
-			m_serializationSizeInBytes+=param_value;
-            }
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
-		b3Assert( status == CL_SUCCESS );
+	for (int i = 0; i < m_arrays.size(); i++)
+	{
+		delete (m_arrays[i]);
+	}
+
+	m_arrays.clear();
+	if (gDebugLauncherCL)
+	{
+		static int counter = 0;
+		printf("[%d] Finished launching OpenCL kernel %s\n", counter++, m_name);
+	}
 }

-
-void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
+void b3LauncherCL::setBuffer(cl_mem clBuffer)
 {
-	for(int i=0; i<n; i++)
+	if (m_enableSerialization)
+	{
+		b3KernelArgData kernelArg;
+		kernelArg.m_argIndex = m_idx;
+		kernelArg.m_isBuffer = 1;
+		kernelArg.m_clBuffer = clBuffer;
+
+		cl_mem_info param_name = CL_MEM_SIZE;
+		size_t param_value;
+		size_t sizeInBytes = sizeof(size_t);
+		size_t actualSizeInBytes;
+		cl_int err;
+		err = clGetMemObjectInfo(kernelArg.m_clBuffer,
+								 param_name,
+								 sizeInBytes,
+								 &param_value,
+								 &actualSizeInBytes);
+
+		b3Assert(err == CL_SUCCESS);
+		kernelArg.m_argSizeInBytes = param_value;
+
+		m_kernelArguments.push_back(kernelArg);
+		m_serializationSizeInBytes += sizeof(b3KernelArgData);
+		m_serializationSizeInBytes += param_value;
+	}
+	cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &clBuffer);
+	b3Assert(status == CL_SUCCESS);
+}
+
+void b3LauncherCL::setBuffers(b3BufferInfoCL* buffInfo, int n)
+{
+	for (int i = 0; i < n; i++)
 	{
 		if (m_enableSerialization)
 		{
@@ -75,106 +74,103 @@ void b3LauncherCL::setBuffers( b3BufferInfoCL* buffInfo, int n )
 			kernelArg.m_argIndex = m_idx;
 			kernelArg.m_isBuffer = 1;
 			kernelArg.m_clBuffer = buffInfo[i].m_clBuffer;
-		
+
 			cl_mem_info param_name = CL_MEM_SIZE;
 			size_t param_value;
 			size_t sizeInBytes = sizeof(size_t);
 			size_t actualSizeInBytes;
 			cl_int err;
-			err = clGetMemObjectInfo (	kernelArg.m_clBuffer,
-									  param_name,
-									  sizeInBytes,
-									  &param_value,
-									  &actualSizeInBytes);
-			
-			b3Assert( err == CL_SUCCESS );
+			err = clGetMemObjectInfo(kernelArg.m_clBuffer,
+									 param_name,
+									 sizeInBytes,
+									 &param_value,
+									 &actualSizeInBytes);
+
+			b3Assert(err == CL_SUCCESS);
 			kernelArg.m_argSizeInBytes = param_value;
-			
+
 			m_kernelArguments.push_back(kernelArg);
-			m_serializationSizeInBytes+= sizeof(b3KernelArgData);
-			m_serializationSizeInBytes+=param_value;
-            }
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
-		b3Assert( status == CL_SUCCESS );
-        }
+			m_serializationSizeInBytes += sizeof(b3KernelArgData);
+			m_serializationSizeInBytes += param_value;
+		}
+		cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &buffInfo[i].m_clBuffer);
+		b3Assert(status == CL_SUCCESS);
+	}
 }

 struct b3KernelArgDataUnaligned
 {
-    int m_isBuffer;
-    int m_argIndex;
-    int m_argSizeInBytes;
+	int m_isBuffer;
+	int m_argIndex;
+	int m_argSizeInBytes;
 	int m_unusedPadding;
-    union
-    {
-        cl_mem m_clBuffer;
-        unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
-    };
-    
+	union {
+		cl_mem m_clBuffer;
+		unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
+	};
 };
 #include <string.h>

-
-
 int b3LauncherCL::deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx)
 {
-    int index=0;
-    
-    int numArguments = *(int*) &buf[index];
-    index+=sizeof(int);
-    
-    for (int i=0;i<numArguments;i++)
-    {
-        b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
+	int index = 0;

-        index+=sizeof(b3KernelArgData);
-        if (arg->m_isBuffer)
-        {
-            b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx,m_commandQueue, arg->m_argSizeInBytes);
-            clData->resize(arg->m_argSizeInBytes);
-            
-            clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
-            
-            arg->m_clBuffer = clData->getBufferCL();
-            
-            m_arrays.push_back(clData);
-            
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
-		b3Assert( status == CL_SUCCESS );
-            index+=arg->m_argSizeInBytes;
-        } else 
-        {
-            cl_int status = clSetKernelArg( m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
-		b3Assert( status == CL_SUCCESS );
-        }
+	int numArguments = *(int*)&buf[index];
+	index += sizeof(int);
+
+	for (int i = 0; i < numArguments; i++)
+	{
+		b3KernelArgDataUnaligned* arg = (b3KernelArgDataUnaligned*)&buf[index];
+
+		index += sizeof(b3KernelArgData);
+		if (arg->m_isBuffer)
+		{
+			b3OpenCLArray<unsigned char>* clData = new b3OpenCLArray<unsigned char>(ctx, m_commandQueue, arg->m_argSizeInBytes);
+			clData->resize(arg->m_argSizeInBytes);
+
+			clData->copyFromHostPointer(&buf[index], arg->m_argSizeInBytes);
+
+			arg->m_clBuffer = clData->getBufferCL();
+
+			m_arrays.push_back(clData);
+
+			cl_int status = clSetKernelArg(m_kernel, m_idx++, sizeof(cl_mem), &arg->m_clBuffer);
+			b3Assert(status == CL_SUCCESS);
+			index += arg->m_argSizeInBytes;
+		}
+		else
+		{
+			cl_int status = clSetKernelArg(m_kernel, m_idx++, arg->m_argSizeInBytes, &arg->m_argData);
+			b3Assert(status == CL_SUCCESS);
+		}
 		b3KernelArgData b;
-		memcpy(&b,arg,sizeof(b3KernelArgDataUnaligned));
-	m_kernelArguments.push_back(b);
-    }
-m_serializationSizeInBytes = index;
-    return index;
+		memcpy(&b, arg, sizeof(b3KernelArgDataUnaligned));
+		m_kernelArguments.push_back(b);
+	}
+	m_serializationSizeInBytes = index;
+	return index;
 }

 int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx)
-  {
-	 int index=0;
-      
-      int numArguments = *(int*) &goldBuffer[index];
-      index+=sizeof(int);
+{
+	int index = 0;
+
+	int numArguments = *(int*)&goldBuffer[index];
+	index += sizeof(int);

 	if (numArguments != m_kernelArguments.size())
 	{
-		printf("failed validation: expected %d arguments, found %d\n",numArguments, m_kernelArguments.size());
+		printf("failed validation: expected %d arguments, found %d\n", numArguments, m_kernelArguments.size());
 		return -1;
 	}
-      
-      for (int ii=0;ii<numArguments;ii++)
-      {
-          b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];
+
+	for (int ii = 0; ii < numArguments; ii++)
+	{
+		b3KernelArgData* argGold = (b3KernelArgData*)&goldBuffer[index];

 		if (m_kernelArguments[ii].m_argSizeInBytes != argGold->m_argSizeInBytes)
 		{
-			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n",ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
+			printf("failed validation: argument %d sizeInBytes expected: %d, found %d\n", ii, argGold->m_argSizeInBytes, m_kernelArguments[ii].m_argSizeInBytes);
 			return -2;
 		}

@@ -184,125 +180,117 @@ int b3LauncherCL::validateResults(unsigned char* goldBuffer, int goldBufferCapac

 			if (expected != found)
 			{
-				printf("failed validation: argument %d isBuffer expected: %d, found %d\n",ii,expected, found);
+				printf("failed validation: argument %d isBuffer expected: %d, found %d\n", ii, expected, found);
 				return -3;
 			}
 		}
-		index+=sizeof(b3KernelArgData);
+		index += sizeof(b3KernelArgData);

 		if (argGold->m_isBuffer)
-          {
-
-			unsigned char* memBuf= (unsigned char*) malloc(m_kernelArguments[ii].m_argSizeInBytes);
+		{
+			unsigned char* memBuf = (unsigned char*)malloc(m_kernelArguments[ii].m_argSizeInBytes);
 			unsigned char* goldBuf = &goldBuffer[index];
-			for (int j=0;j<m_kernelArguments[j].m_argSizeInBytes;j++)
+			for (int j = 0; j < m_kernelArguments[j].m_argSizeInBytes; j++)
 			{
 				memBuf[j] = 0xaa;
 			}

 			cl_int status = 0;
-			status = clEnqueueReadBuffer( m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
-                                           memBuf, 0,0,0 );
-              b3Assert( status==CL_SUCCESS );
-              clFinish(m_commandQueue);
+			status = clEnqueueReadBuffer(m_commandQueue, m_kernelArguments[ii].m_clBuffer, CL_TRUE, 0, m_kernelArguments[ii].m_argSizeInBytes,
+										 memBuf, 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
+			clFinish(m_commandQueue);

-			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
+			for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
 			{
 				int expected = goldBuf[b];
 				int found = memBuf[b];
 				if (expected != found)
 				{
 					printf("failed validation: argument %d OpenCL data at byte position %d expected: %d, found %d\n",
-						ii, b, expected, found);
+						   ii, b, expected, found);
 					return -4;
 				}
 			}

-              
-              index+=argGold->m_argSizeInBytes;
-          } else 
-          {
-			
+			index += argGold->m_argSizeInBytes;
+		}
+		else
+		{
 			//compare content
-			for (int b=0;b<m_kernelArguments[ii].m_argSizeInBytes;b++)
+			for (int b = 0; b < m_kernelArguments[ii].m_argSizeInBytes; b++)
 			{
 				int expected = argGold->m_argData[b];
-				int found =m_kernelArguments[ii].m_argData[b];
+				int found = m_kernelArguments[ii].m_argData[b];
 				if (expected != found)
 				{
 					printf("failed validation: argument %d const data at byte position %d expected: %d, found %d\n",
-						ii, b, expected, found);
+						   ii, b, expected, found);
 					return -5;
 				}
 			}
-
-          }
-      }
-      return index;
-
+		}
+	}
+	return index;
 }

 int b3LauncherCL::serializeArguments(unsigned char* destBuffer, int destBufferCapacity)
 {
-//initialize to known values
-for (int i=0;i<destBufferCapacity;i++)
-	destBuffer[i] = 0xec;
+	//initialize to known values
+	for (int i = 0; i < destBufferCapacity; i++)
+		destBuffer[i] = 0xec;

-    assert(destBufferCapacity>=m_serializationSizeInBytes);
-    
-    //todo: use the b3Serializer for this to allow for 32/64bit, endianness etc        
-    int numArguments = m_kernelArguments.size();
-    int curBufferSize = 0;
-    int* dest = (int*)&destBuffer[curBufferSize];
-    *dest = numArguments;
-    curBufferSize += sizeof(int);
-    
-    
-    
-    for (int i=0;i<this->m_kernelArguments.size();i++)
-    {
-        b3KernelArgData* arg = (b3KernelArgData*) &destBuffer[curBufferSize];
-        *arg = m_kernelArguments[i];
-        curBufferSize+=sizeof(b3KernelArgData);
-        if (arg->m_isBuffer==1)
-        {
-            //copy the OpenCL buffer content
-            cl_int status = 0;
-            status = clEnqueueReadBuffer( m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
-                                         &destBuffer[curBufferSize], 0,0,0 );
-            b3Assert( status==CL_SUCCESS );
-            clFinish(m_commandQueue);
-            curBufferSize+=arg->m_argSizeInBytes;
-        }
-        
-    }
-    return curBufferSize;
+	assert(destBufferCapacity >= m_serializationSizeInBytes);
+
+	//todo: use the b3Serializer for this to allow for 32/64bit, endianness etc
+	int numArguments = m_kernelArguments.size();
+	int curBufferSize = 0;
+	int* dest = (int*)&destBuffer[curBufferSize];
+	*dest = numArguments;
+	curBufferSize += sizeof(int);
+
+	for (int i = 0; i < this->m_kernelArguments.size(); i++)
+	{
+		b3KernelArgData* arg = (b3KernelArgData*)&destBuffer[curBufferSize];
+		*arg = m_kernelArguments[i];
+		curBufferSize += sizeof(b3KernelArgData);
+		if (arg->m_isBuffer == 1)
+		{
+			//copy the OpenCL buffer content
+			cl_int status = 0;
+			status = clEnqueueReadBuffer(m_commandQueue, arg->m_clBuffer, 0, 0, arg->m_argSizeInBytes,
+										 &destBuffer[curBufferSize], 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
+			clFinish(m_commandQueue);
+			curBufferSize += arg->m_argSizeInBytes;
+		}
+	}
+	return curBufferSize;
 }

 void b3LauncherCL::serializeToFile(const char* fileName, int numWorkItems)
 {
 	int num = numWorkItems;
 	int buffSize = getSerializationBufferSize();
-	unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
-	for (int i=0;i<buffSize+1;i++)
+	unsigned char* buf = new unsigned char[buffSize + sizeof(int)];
+	for (int i = 0; i < buffSize + 1; i++)
 	{
 		unsigned char* ptr = (unsigned char*)&buf[i];
 		*ptr = 0xff;
 	}
-//	int actualWrite = serializeArguments(buf,buffSize);
-              
-//	unsigned char* cptr = (unsigned char*)&buf[buffSize];
-//            printf("buf[buffSize] = %d\n",*cptr);
-              
-	assert(buf[buffSize]==0xff);//check for buffer overrun
+	//	int actualWrite = serializeArguments(buf,buffSize);
+
+	//	unsigned char* cptr = (unsigned char*)&buf[buffSize];
+	//            printf("buf[buffSize] = %d\n",*cptr);
+
+	assert(buf[buffSize] == 0xff);  //check for buffer overrun
 	int* ptr = (int*)&buf[buffSize];
-              
+
 	*ptr = num;
-              
-	FILE* f = fopen(fileName,"wb");
-	fwrite(buf,buffSize+sizeof(int),1,f);
+
+	FILE* f = fopen(fileName, "wb");
+	fwrite(buf, buffSize + sizeof(int), 1, f);
 	fclose(f);

 	delete[] buf;
-}		
-
+}
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h
@@ -9,60 +9,57 @@

 #define B3_DEBUG_SERIALIZE_CL

-
 #ifdef _WIN32
-#pragma warning(disable :4996)
+#pragma warning(disable : 4996)
 #endif
 #define B3_CL_MAX_ARG_SIZE 16
-B3_ATTRIBUTE_ALIGNED16(struct) b3KernelArgData
+B3_ATTRIBUTE_ALIGNED16(struct)
+b3KernelArgData
 {
-    int m_isBuffer;
-    int m_argIndex;
-    int m_argSizeInBytes;
+	int m_isBuffer;
+	int m_argIndex;
+	int m_argSizeInBytes;
 	int m_unusedPadding;
-    union
-    {
-        cl_mem m_clBuffer;
-        unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
-    };
-    
+	union {
+		cl_mem m_clBuffer;
+		unsigned char m_argData[B3_CL_MAX_ARG_SIZE];
+	};
 };

 class b3LauncherCL
 {
-
 	cl_command_queue m_commandQueue;
 	cl_kernel m_kernel;
 	int m_idx;
-	
-    b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
-    int m_serializationSizeInBytes;
-	bool	m_enableSerialization;
+
+	b3AlignedObjectArray<b3KernelArgData> m_kernelArguments;
+	int m_serializationSizeInBytes;
+	bool m_enableSerialization;

 	const char* m_name;
-	public:

-     b3AlignedObjectArray<b3OpenCLArray<unsigned char>* > m_arrays;
-    
-		b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
-    
-    virtual ~b3LauncherCL();
-    
-		void setBuffer( cl_mem clBuffer);
+public:
+	b3AlignedObjectArray<b3OpenCLArray<unsigned char>*> m_arrays;

-		void setBuffers( b3BufferInfoCL* buffInfo, int n );
-    
-    int getSerializationBufferSize() const 
-    {
-        return m_serializationSizeInBytes;
-    }
-    
-    int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);
+	b3LauncherCL(cl_command_queue queue, cl_kernel kernel, const char* name);
+
+	virtual ~b3LauncherCL();
+
+	void setBuffer(cl_mem clBuffer);
+
+	void setBuffers(b3BufferInfoCL* buffInfo, int n);
+
+	int getSerializationBufferSize() const
+	{
+		return m_serializationSizeInBytes;
+	}
+
+	int deserializeArgs(unsigned char* buf, int bufSize, cl_context ctx);

 	inline int validateResults(unsigned char* goldBuffer, int goldBufferCapacity, cl_context ctx);

-    int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
-    
+	int serializeArguments(unsigned char* destBuffer, int destBufferCapacity);
+
 	int getNumArguments() const
 	{
 		return m_kernelArguments.size();
@@ -75,61 +72,57 @@ class b3LauncherCL

 	void serializeToFile(const char* fileName, int numWorkItems);

-	template<typename T>
-		inline void setConst( const T& consts )
-		{
-			int sz=sizeof(T);
-			b3Assert(sz<=B3_CL_MAX_ARG_SIZE);
+	template <typename T>
+	inline void setConst(const T& consts)
+	{
+		int sz = sizeof(T);
+		b3Assert(sz <= B3_CL_MAX_ARG_SIZE);

-			if (m_enableSerialization)
-			{
-				b3KernelArgData kernelArg;
-				kernelArg.m_argIndex = m_idx;
-				kernelArg.m_isBuffer = 0;
-				T* destArg = (T*)kernelArg.m_argData;
-				*destArg = consts;
-				kernelArg.m_argSizeInBytes = sizeof(T);
-				m_kernelArguments.push_back(kernelArg);
-				m_serializationSizeInBytes+=sizeof(b3KernelArgData);
-			}
-            
-			cl_int status = clSetKernelArg( m_kernel, m_idx++, sz, &consts );
-			b3Assert( status == CL_SUCCESS );
+		if (m_enableSerialization)
+		{
+			b3KernelArgData kernelArg;
+			kernelArg.m_argIndex = m_idx;
+			kernelArg.m_isBuffer = 0;
+			T* destArg = (T*)kernelArg.m_argData;
+			*destArg = consts;
+			kernelArg.m_argSizeInBytes = sizeof(T);
+			m_kernelArguments.push_back(kernelArg);
+			m_serializationSizeInBytes += sizeof(b3KernelArgData);
 		}

-		inline void launch1D( int numThreads, int localSize = 64)
-		{
-			launch2D( numThreads, 1, localSize, 1 );
-		}
+		cl_int status = clSetKernelArg(m_kernel, m_idx++, sz, &consts);
+		b3Assert(status == CL_SUCCESS);
+	}

-		inline void launch2D( int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
-		{
-			size_t gRange[3] = {1,1,1};
-			size_t lRange[3] = {1,1,1};
-			lRange[0] = localSizeX;
-			lRange[1] = localSizeY;
-			gRange[0] = b3Max((size_t)1, (numThreadsX/lRange[0])+(!(numThreadsX%lRange[0])?0:1));
-			gRange[0] *= lRange[0];
-			gRange[1] = b3Max((size_t)1, (numThreadsY/lRange[1])+(!(numThreadsY%lRange[1])?0:1));
-			gRange[1] *= lRange[1];
+	inline void launch1D(int numThreads, int localSize = 64)
+	{
+		launch2D(numThreads, 1, localSize, 1);
+	}

-			cl_int status = clEnqueueNDRangeKernel( m_commandQueue, 
-				m_kernel, 2, NULL, gRange, lRange, 0,0,0 );
-            if (status != CL_SUCCESS)
-            {
-                printf("Error: OpenCL status = %d\n",status);
-            }
-			b3Assert( status == CL_SUCCESS );
+	inline void launch2D(int numThreadsX, int numThreadsY, int localSizeX, int localSizeY)
+	{
+		size_t gRange[3] = {1, 1, 1};
+		size_t lRange[3] = {1, 1, 1};
+		lRange[0] = localSizeX;
+		lRange[1] = localSizeY;
+		gRange[0] = b3Max((size_t)1, (numThreadsX / lRange[0]) + (!(numThreadsX % lRange[0]) ? 0 : 1));
+		gRange[0] *= lRange[0];
+		gRange[1] = b3Max((size_t)1, (numThreadsY / lRange[1]) + (!(numThreadsY % lRange[1]) ? 0 : 1));
+		gRange[1] *= lRange[1];

-		}
-	
-		void	enableSerialization(bool serialize)
+		cl_int status = clEnqueueNDRangeKernel(m_commandQueue,
+											   m_kernel, 2, NULL, gRange, lRange, 0, 0, 0);
+		if (status != CL_SUCCESS)
 		{
-			m_enableSerialization = serialize;
+			printf("Error: OpenCL status = %d\n", status);
 		}
-		
+		b3Assert(status == CL_SUCCESS);
+	}
+
+	void enableSerialization(bool serialize)
+	{
+		m_enableSerialization = serialize;
+	}
 };

-
-
-#endif //B3_LAUNCHER_CL_H
+#endif  //B3_LAUNCHER_CL_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h
@@ -7,16 +7,16 @@
 template <typename T>
 class b3OpenCLArray
 {
-	size_t	m_size;
-	size_t	m_capacity;
-	cl_mem	m_clBuffer;
+	size_t m_size;
+	size_t m_capacity;
+	cl_mem m_clBuffer;

-	cl_context		 m_clContext;
+	cl_context m_clContext;
 	cl_command_queue m_commandQueue;

-	bool	m_ownsMemory;
+	bool m_ownsMemory;

-	bool	m_allowGrowingCapacity;
+	bool m_allowGrowingCapacity;

 	void deallocate()
 	{
@@ -25,22 +25,19 @@ class b3OpenCLArray
 			clReleaseMemObject(m_clBuffer);
 		}
 		m_clBuffer = 0;
-		m_capacity=0;
+		m_capacity = 0;
 	}

 	b3OpenCLArray<T>& operator=(const b3OpenCLArray<T>& src);

-	B3_FORCE_INLINE	size_t	allocSize(size_t size)
-		{
-			return (size ? size*2 : 1);
-		}
+	B3_FORCE_INLINE size_t allocSize(size_t size)
+	{
+		return (size ? size * 2 : 1);
+	}

 public:
-
-	b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity=0, bool allowGrowingCapacity=true)
-	:m_size(0),  m_capacity(0),m_clBuffer(0),
-	m_clContext(ctx),m_commandQueue(queue),
-	m_ownsMemory(true),m_allowGrowingCapacity(true)
+	b3OpenCLArray(cl_context ctx, cl_command_queue queue, size_t initialCapacity = 0, bool allowGrowingCapacity = true)
+		: m_size(0), m_capacity(0), m_clBuffer(0), m_clContext(ctx), m_commandQueue(queue), m_ownsMemory(true), m_allowGrowingCapacity(true)
 	{
 		if (initialCapacity)
 		{
@@ -60,34 +57,32 @@ public:
 		m_capacity = sizeInElements;
 	}

-// we could enable this assignment, but need to make sure to avoid accidental deep copies
-//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
-//	{
-//		copyFromArray(src);
-//		return *this;
-//	}
+	// we could enable this assignment, but need to make sure to avoid accidental deep copies
+	//	b3OpenCLArray<T>& operator=(const b3AlignedObjectArray<T>& src)
+	//	{
+	//		copyFromArray(src);
+	//		return *this;
+	//	}

-
-	cl_mem	getBufferCL() const
+	cl_mem getBufferCL() const
 	{
 		return m_clBuffer;
 	}

-
 	virtual ~b3OpenCLArray()
 	{
 		deallocate();
-		m_size=0;
-		m_capacity=0;
+		m_size = 0;
+		m_capacity = 0;
 	}

-	B3_FORCE_INLINE	bool push_back(const T& _Val,bool waitForCompletion=true)
+	B3_FORCE_INLINE bool push_back(const T& _Val, bool waitForCompletion = true)
 	{
 		bool result = true;
 		size_t sz = size();
-		if( sz == capacity() )
+		if (sz == capacity())
 		{
-			result = reserve( allocSize(size()) );
+			result = reserve(allocSize(size()));
 		}
 		copyFromHostPointer(&_Val, 1, sz, waitForCompletion);
 		m_size++;
@@ -96,23 +91,23 @@ public:

 	B3_FORCE_INLINE T forcedAt(size_t n) const
 	{
-		b3Assert(n>=0);
-		b3Assert(n<capacity());
+		b3Assert(n >= 0);
+		b3Assert(n < capacity());
 		T elem;
-		copyToHostPointer(&elem,1,n,true);
+		copyToHostPointer(&elem, 1, n, true);
 		return elem;
 	}

 	B3_FORCE_INLINE T at(size_t n) const
 	{
-		b3Assert(n>=0);
-		b3Assert(n<size());
+		b3Assert(n >= 0);
+		b3Assert(n < size());
 		T elem;
-		copyToHostPointer(&elem,1,n,true);
+		copyToHostPointer(&elem, 1, n, true);
 		return elem;
 	}

-	B3_FORCE_INLINE	bool resize(size_t newsize, bool copyOldContents=true)
+	B3_FORCE_INLINE bool resize(size_t newsize, bool copyOldContents = true)
 	{
 		bool result = true;
 		size_t curSize = size();
@@ -120,11 +115,12 @@ public:
 		if (newsize < curSize)
 		{
 			//leave the OpenCL memory for now
-		} else
+		}
+		else
 		{
 			if (newsize > size())
 			{
-				result = reserve(newsize,copyOldContents);
+				result = reserve(newsize, copyOldContents);
 			}

 			//leave new data uninitialized (init in debug mode?)
@@ -134,7 +130,8 @@ public:
 		if (result)
 		{
 			m_size = newsize;
-		} else
+		}
+		else
 		{
 			m_size = 0;
 		}
@@ -146,25 +143,25 @@ public:
 		return m_size;
 	}

-	B3_FORCE_INLINE	size_t capacity() const
+	B3_FORCE_INLINE size_t capacity() const
 	{
 		return m_capacity;
 	}

-	B3_FORCE_INLINE	bool reserve(size_t _Count, bool copyOldContents=true)
+	B3_FORCE_INLINE bool reserve(size_t _Count, bool copyOldContents = true)
 	{
-		bool result=true;
+		bool result = true;
 		// determine new minimum length of allocated storage
 		if (capacity() < _Count)
-		{	// not enough room, reallocate
+		{  // not enough room, reallocate

 			if (m_allowGrowingCapacity)
 			{
 				cl_int ciErrNum;
 				//create a new OpenCL buffer
-				size_t memSizeInBytes = sizeof(T)*_Count;
+				size_t memSizeInBytes = sizeof(T) * _Count;
 				cl_mem buf = clCreateBuffer(m_clContext, CL_MEM_READ_WRITE, memSizeInBytes, NULL, &ciErrNum);
-				if (ciErrNum!=CL_SUCCESS)
+				if (ciErrNum != CL_SUCCESS)
 				{
 					b3Error("OpenCL out-of-memory\n");
 					_Count = 0;
@@ -173,13 +170,13 @@ public:
 //#define B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
 #ifdef B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
 				unsigned char* src = (unsigned char*)malloc(memSizeInBytes);
-				for (size_t i=0;i<memSizeInBytes;i++)
+				for (size_t i = 0; i < memSizeInBytes; i++)
 					src[i] = 0xbb;
-				ciErrNum = clEnqueueWriteBuffer( m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0,0,0 );
-				b3Assert(ciErrNum==CL_SUCCESS);
+				ciErrNum = clEnqueueWriteBuffer(m_commandQueue, buf, CL_TRUE, 0, memSizeInBytes, src, 0, 0, 0);
+				b3Assert(ciErrNum == CL_SUCCESS);
 				clFinish(m_commandQueue);
 				free(src);
-#endif //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS
+#endif  //B3_ALWAYS_INITIALIZE_OPENCL_BUFFERS

 				if (result)
 				{
@@ -193,21 +190,21 @@ public:
 				m_clBuffer = buf;

 				m_capacity = _Count;
-			} else
+			}
+			else
 			{
 				//fail: assert and
 				b3Assert(0);
 				deallocate();
-				result=false;
+				result = false;
 			}
 		}
 		return result;
 	}

-
-	void copyToCL(cl_mem destination, size_t numElements, size_t firstElem=0, size_t dstOffsetInElems=0) const
+	void copyToCL(cl_mem destination, size_t numElements, size_t firstElem = 0, size_t dstOffsetInElems = 0) const
 	{
-		if (numElements<=0)
+		if (numElements <= 0)
 			return;

 		b3Assert(m_clBuffer);
@@ -216,75 +213,74 @@ public:
 		//likely some error, destination is same as source
 		b3Assert(m_clBuffer != destination);

-		b3Assert((firstElem+numElements)<=m_size);
+		b3Assert((firstElem + numElements) <= m_size);

 		cl_int status = 0;

+		b3Assert(numElements > 0);
+		b3Assert(numElements <= m_size);

-		b3Assert(numElements>0);
-		b3Assert(numElements<=m_size);
+		size_t srcOffsetBytes = sizeof(T) * firstElem;
+		size_t dstOffsetInBytes = sizeof(T) * dstOffsetInElems;

-		size_t srcOffsetBytes = sizeof(T)*firstElem;
-		size_t dstOffsetInBytes = sizeof(T)*dstOffsetInElems;
+		status = clEnqueueCopyBuffer(m_commandQueue, m_clBuffer, destination,
+									 srcOffsetBytes, dstOffsetInBytes, sizeof(T) * numElements, 0, 0, 0);

-		status = clEnqueueCopyBuffer( m_commandQueue, m_clBuffer, destination,
-			srcOffsetBytes, dstOffsetInBytes, sizeof(T)*numElements, 0, 0, 0 );
-
-		b3Assert( status == CL_SUCCESS );
+		b3Assert(status == CL_SUCCESS);
 	}

-	void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion=true)
+	void copyFromHost(const b3AlignedObjectArray<T>& srcArray, bool waitForCompletion = true)
 	{
 		size_t newSize = srcArray.size();

 		bool copyOldContents = false;
-		resize (newSize,copyOldContents);
+		resize(newSize, copyOldContents);
 		if (newSize)
-			copyFromHostPointer(&srcArray[0],newSize,0,waitForCompletion);
-
+			copyFromHostPointer(&srcArray[0], newSize, 0, waitForCompletion);
 	}

-	void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem= 0, bool waitForCompletion=true)
+	void copyFromHostPointer(const T* src, size_t numElems, size_t destFirstElem = 0, bool waitForCompletion = true)
 	{
-		b3Assert(numElems+destFirstElem <= capacity());
+		b3Assert(numElems + destFirstElem <= capacity());

-		if (numElems+destFirstElem)
+		if (numElems + destFirstElem)
 		{
 			cl_int status = 0;
-			size_t sizeInBytes=sizeof(T)*numElems;
-			status = clEnqueueWriteBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*destFirstElem, sizeInBytes,
-			src, 0,0,0 );
-			b3Assert(status == CL_SUCCESS );
+			size_t sizeInBytes = sizeof(T) * numElems;
+			status = clEnqueueWriteBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * destFirstElem, sizeInBytes,
+										  src, 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);
 			if (waitForCompletion)
 				clFinish(m_commandQueue);
-		} else
+		}
+		else
 		{
 			b3Error("copyFromHostPointer invalid range\n");
 		}
 	}

-
-	void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion=true) const
+	void copyToHost(b3AlignedObjectArray<T>& destArray, bool waitForCompletion = true) const
 	{
 		destArray.resize(this->size());
 		if (size())
-			copyToHostPointer(&destArray[0], size(),0,waitForCompletion);
+			copyToHostPointer(&destArray[0], size(), 0, waitForCompletion);
 	}

-	void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem=0, bool waitForCompletion=true) const
+	void copyToHostPointer(T* destPtr, size_t numElem, size_t srcFirstElem = 0, bool waitForCompletion = true) const
 	{
-		b3Assert(numElem+srcFirstElem <= capacity());
+		b3Assert(numElem + srcFirstElem <= capacity());

-		if(numElem+srcFirstElem <= capacity())
+		if (numElem + srcFirstElem <= capacity())
 		{
 			cl_int status = 0;
-			status = clEnqueueReadBuffer( m_commandQueue, m_clBuffer, 0, sizeof(T)*srcFirstElem, sizeof(T)*numElem,
-			destPtr, 0,0,0 );
-			b3Assert( status==CL_SUCCESS );
+			status = clEnqueueReadBuffer(m_commandQueue, m_clBuffer, 0, sizeof(T) * srcFirstElem, sizeof(T) * numElem,
+										 destPtr, 0, 0, 0);
+			b3Assert(status == CL_SUCCESS);

 			if (waitForCompletion)
 				clFinish(m_commandQueue);
-		} else
+		}
+		else
 		{
 			b3Error("copyToHostPointer invalid range\n");
 		}
@@ -296,11 +292,9 @@ public:
 		resize(newSize);
 		if (size())
 		{
-			src.copyToCL(m_clBuffer,size());
+			src.copyToCL(m_clBuffer, size());
 		}
 	}
-
 };

-
-#endif //B3_OPENCL_ARRAY_H
+#endif  //B3_OPENCL_ARRAY_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.cpp
@@ -7,25 +7,24 @@
 #include "kernels/PrefixScanKernelsCL.h"

 b3PrefixScanCL::b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	const char* scanKernelSource = prefixScanKernelsCL;
 	cl_int pErrNum;
-	char* additionalMacros=0;
+	char* additionalMacros = 0;

-	m_workBuffer = new b3OpenCLArray<unsigned int>(ctx,queue,size);
-	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_PROG_PATH);
+	m_workBuffer = new b3OpenCLArray<unsigned int>(ctx, queue, size);
+	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_PROG_PATH);
 	b3Assert(scanProg);

-	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_localScanKernel );
-	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_blockSumKernel );
-	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_propagationKernel );
+	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_localScanKernel);
+	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_blockSumKernel);
+	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_propagationKernel);
 }

-
 b3PrefixScanCL::~b3PrefixScanCL()
 {
 	delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanCL::~b3PrefixScanCL()
 	clReleaseKernel(m_propagationKernel);
 }

-template<class T>
+template <class T>
 T b3NextPowerOf2(T n)
 {
 	n -= 1;
-	for(int i=0; i<sizeof(T)*8; i++)
-		n = n | (n>>i);
-	return n+1;
+	for (int i = 0; i < sizeof(T) * 8; i++)
+		n = n | (n >> i);
+	return n + 1;
 }

 void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum)
 {
-	
-//	b3Assert( data->m_option == EXCLUSIVE );
-	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
+	//	b3Assert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));

 	dst.resize(src.size());
 	m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanCL::execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<uns
 	b3Int4 constBuffer;
 	constBuffer.x = n;
 	constBuffer.y = numBlocks;
-	constBuffer.z = (int)b3NextPowerOf2( numBlocks );
+	constBuffer.z = (int)b3NextPowerOf2(numBlocks);

 	b3OpenCLArray<unsigned int>* srcNative = &src;
 	b3OpenCLArray<unsigned int>* dstNative = &dst;
-	
-	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };

-		b3LauncherCL launcher( m_commandQueue, m_localScanKernel,"m_localScanKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst(  constBuffer );
-		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
+	{
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
+
+		b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
 	}

 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};

-		b3LauncherCL launcher( m_commandQueue, m_blockSumKernel,"m_blockSumKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
+		b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
 	}
-	

-	if( numBlocks > 1 )
+	if (numBlocks > 1)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
-		b3LauncherCL launcher( m_commandQueue, m_propagationKernel,"m_propagationKernel" );
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
+		b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
 	}

-
-	if( sum )
+	if (sum)
 	{
 		clFinish(m_commandQueue);
-		dstNative->copyToHostPointer(sum,1,n-1,true);
+		dstNative->copyToHostPointer(sum, 1, n - 1, true);
 	}
-
 }

-
 void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum)
 {
 	unsigned int s = 0;
 	//if( data->m_option == EXCLUSIVE )
 	{
-		for(int i=0; i<n; i++)
+		for (int i = 0; i < n; i++)
 		{
 			dst[i] = s;
 			s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanCL::executeHost(b3AlignedObjectArray<unsigned int>& src, b3Alig
 	}
 	*/

-	if( sum )
+	if (sum)
 	{
-		*sum = dst[n-1];
+		*sum = dst[n - 1];
 	}
 }
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanCL.h
@@ -13,9 +13,9 @@ class b3PrefixScanCL
 		BLOCK_SIZE = 128
 	};

-//	Option m_option;
+	//	Option m_option;

-	cl_command_queue	m_commandQueue;
+	cl_command_queue m_commandQueue;

 	cl_kernel m_localScanKernel;
 	cl_kernel m_blockSumKernel;
@@ -23,15 +23,13 @@ class b3PrefixScanCL

 	b3OpenCLArray<unsigned int>* m_workBuffer;

-
-	public:
-		
-	b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
+public:
+	b3PrefixScanCL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);

 	virtual ~b3PrefixScanCL();

 	void execute(b3OpenCLArray<unsigned int>& src, b3OpenCLArray<unsigned int>& dst, int n, unsigned int* sum = 0);
-	void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum=0);
+	void executeHost(b3AlignedObjectArray<unsigned int>& src, b3AlignedObjectArray<unsigned int>& dst, int n, unsigned int* sum = 0);
 };

-#endif //B3_PREFIX_SCAN_CL_H
+#endif  //B3_PREFIX_SCAN_CL_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.cpp
@@ -7,25 +7,24 @@
 #include "kernels/PrefixScanKernelsFloat4CL.h"

 b3PrefixScanFloat4CL::b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size)
-:m_commandQueue(queue)
+	: m_commandQueue(queue)
 {
 	const char* scanKernelSource = prefixScanKernelsFloat4CL;
 	cl_int pErrNum;
-	char* additionalMacros=0;
+	char* additionalMacros = 0;

-	m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx,queue,size);
-	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, scanKernelSource, &pErrNum,additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
+	m_workBuffer = new b3OpenCLArray<b3Vector3>(ctx, queue, size);
+	cl_program scanProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, scanKernelSource, &pErrNum, additionalMacros, B3_PREFIXSCAN_FLOAT4_PROG_PATH);
 	b3Assert(scanProg);

-	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_localScanKernel );
-	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_blockSumKernel );
-	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString( ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg,additionalMacros );
-	b3Assert(m_propagationKernel );
+	m_localScanKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "LocalScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_localScanKernel);
+	m_blockSumKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "TopLevelScanKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_blockSumKernel);
+	m_propagationKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, scanKernelSource, "AddOffsetKernel", &pErrNum, scanProg, additionalMacros);
+	b3Assert(m_propagationKernel);
 }

-
 b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
 {
 	delete m_workBuffer;
@@ -34,20 +33,19 @@ b3PrefixScanFloat4CL::~b3PrefixScanFloat4CL()
 	clReleaseKernel(m_propagationKernel);
 }

-template<class T>
+template <class T>
 T b3NextPowerOf2(T n)
 {
 	n -= 1;
-	for(int i=0; i<sizeof(T)*8; i++)
-		n = n | (n>>i);
-	return n+1;
+	for (int i = 0; i < sizeof(T) * 8; i++)
+		n = n | (n >> i);
+	return n + 1;
 }

 void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<b3Vector3>& dst, int n, b3Vector3* sum)
 {
-	
-//	b3Assert( data->m_option == EXCLUSIVE );
-	const unsigned int numBlocks = (const unsigned int)( (n+BLOCK_SIZE*2-1)/(BLOCK_SIZE*2) );
+	//	b3Assert( data->m_option == EXCLUSIVE );
+	const unsigned int numBlocks = (const unsigned int)((n + BLOCK_SIZE * 2 - 1) / (BLOCK_SIZE * 2));

 	dst.resize(src.size());
 	m_workBuffer->resize(src.size());
@@ -55,55 +53,51 @@ void b3PrefixScanFloat4CL::execute(b3OpenCLArray<b3Vector3>& src, b3OpenCLArray<
 	b3Int4 constBuffer;
 	constBuffer.x = n;
 	constBuffer.y = numBlocks;
-	constBuffer.z = (int)b3NextPowerOf2( numBlocks );
+	constBuffer.z = (int)b3NextPowerOf2(numBlocks);

 	b3OpenCLArray<b3Vector3>* srcNative = &src;
 	b3OpenCLArray<b3Vector3>* dstNative = &dst;
-	
-	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( srcNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };

-		b3LauncherCL launcher( m_commandQueue, m_localScanKernel ,"m_localScanKernel");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst(  constBuffer );
-		launcher.launch1D( numBlocks*BLOCK_SIZE, BLOCK_SIZE );
+	{
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(srcNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
+
+		b3LauncherCL launcher(m_commandQueue, m_localScanKernel, "m_localScanKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(numBlocks * BLOCK_SIZE, BLOCK_SIZE);
 	}

 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_workBuffer->getBufferCL())};

-		b3LauncherCL launcher( m_commandQueue, m_blockSumKernel ,"m_blockSumKernel");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( BLOCK_SIZE, BLOCK_SIZE );
+		b3LauncherCL launcher(m_commandQueue, m_blockSumKernel, "m_blockSumKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D(BLOCK_SIZE, BLOCK_SIZE);
 	}
-	

-	if( numBlocks > 1 )
+	if (numBlocks > 1)
 	{
-		b3BufferInfoCL bInfo[] = { b3BufferInfoCL( dstNative->getBufferCL() ), b3BufferInfoCL( m_workBuffer->getBufferCL() ) };
-		b3LauncherCL launcher( m_commandQueue, m_propagationKernel ,"m_propagationKernel");
-		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
-		launcher.setConst( constBuffer );
-		launcher.launch1D( (numBlocks-1)*BLOCK_SIZE, BLOCK_SIZE );
+		b3BufferInfoCL bInfo[] = {b3BufferInfoCL(dstNative->getBufferCL()), b3BufferInfoCL(m_workBuffer->getBufferCL())};
+		b3LauncherCL launcher(m_commandQueue, m_propagationKernel, "m_propagationKernel");
+		launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
+		launcher.setConst(constBuffer);
+		launcher.launch1D((numBlocks - 1) * BLOCK_SIZE, BLOCK_SIZE);
 	}

-
-	if( sum )
+	if (sum)
 	{
 		clFinish(m_commandQueue);
-		dstNative->copyToHostPointer(sum,1,n-1,true);
+		dstNative->copyToHostPointer(sum, 1, n - 1, true);
 	}
-
 }

-
 void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum)
 {
-	b3Vector3 s=b3MakeVector3(0,0,0);
+	b3Vector3 s = b3MakeVector3(0, 0, 0);
 	//if( data->m_option == EXCLUSIVE )
 	{
-		for(int i=0; i<n; i++)
+		for (int i = 0; i < n; i++)
 		{
 			dst[i] = s;
 			s += src[i];
@@ -119,8 +113,8 @@ void b3PrefixScanFloat4CL::executeHost(b3AlignedObjectArray<b3Vector3>& src, b3A
 	}
 	*/

-	if( sum )
+	if (sum)
 	{
-		*sum = dst[n-1];
+		*sum = dst[n - 1];
 	}
 }
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h
@@ -14,9 +14,9 @@ class b3PrefixScanFloat4CL
 		BLOCK_SIZE = 128
 	};

-//	Option m_option;
+	//	Option m_option;

-	cl_command_queue	m_commandQueue;
+	cl_command_queue m_commandQueue;

 	cl_kernel m_localScanKernel;
 	cl_kernel m_blockSumKernel;
@@ -24,10 +24,8 @@ class b3PrefixScanFloat4CL

 	b3OpenCLArray<b3Vector3>* m_workBuffer;

-
-	public:
-		
-	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue,int size=0);
+public:
+	b3PrefixScanFloat4CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int size = 0);

 	virtual ~b3PrefixScanFloat4CL();

@@ -35,4 +33,4 @@ class b3PrefixScanFloat4CL
 	void executeHost(b3AlignedObjectArray<b3Vector3>& src, b3AlignedObjectArray<b3Vector3>& dst, int n, b3Vector3* sum);
 };

-#endif //B3_PREFIX_SCAN_CL_H
+#endif  //B3_PREFIX_SCAN_CL_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.cpp
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
@@ -6,90 +6,79 @@

 struct b3SortData
 {
-	union
-	{
+	union {
 		unsigned int m_key;
 		unsigned int x;
 	};

-	union
-	{
+	union {
 		unsigned int m_value;
 		unsigned int y;
-		
 	};
 };
 #include "b3BufferInfoCL.h"

-class  b3RadixSort32CL
+class b3RadixSort32CL
 {
+	b3OpenCLArray<unsigned int>* m_workBuffer1;
+	b3OpenCLArray<unsigned int>* m_workBuffer2;

-		b3OpenCLArray<unsigned int>* m_workBuffer1;
-		b3OpenCLArray<unsigned int>* m_workBuffer2;
-		
-		b3OpenCLArray<b3SortData>*	m_workBuffer3;
-		b3OpenCLArray<b3SortData>*	m_workBuffer4;
+	b3OpenCLArray<b3SortData>* m_workBuffer3;
+	b3OpenCLArray<b3SortData>* m_workBuffer4;

-		b3OpenCLArray<unsigned int>* m_workBuffer3a;
-		b3OpenCLArray<unsigned int>* m_workBuffer4a;
+	b3OpenCLArray<unsigned int>* m_workBuffer3a;
+	b3OpenCLArray<unsigned int>* m_workBuffer4a;

-		cl_command_queue	m_commandQueue;
+	cl_command_queue m_commandQueue;

-		cl_kernel m_streamCountSortDataKernel;
-		cl_kernel m_streamCountKernel;
+	cl_kernel m_streamCountSortDataKernel;
+	cl_kernel m_streamCountKernel;

-		cl_kernel m_prefixScanKernel;
-		cl_kernel m_sortAndScatterSortDataKernel;
-		cl_kernel m_sortAndScatterKernel;
+	cl_kernel m_prefixScanKernel;
+	cl_kernel m_sortAndScatterSortDataKernel;
+	cl_kernel m_sortAndScatterKernel;

+	bool m_deviceCPU;

-		bool	m_deviceCPU;
-
-		class b3PrefixScanCL* m_scan;
-		class b3FillCL*	m_fill;
+	class b3PrefixScanCL* m_scan;
+	class b3FillCL* m_fill;

 public:
 	struct b3ConstData
-		{
-			int m_n;
-			int m_nWGs;
-			int m_startBit;
-			int m_nBlocksPerWG;
-		};
+	{
+		int m_n;
+		int m_nWGs;
+		int m_startBit;
+		int m_nBlocksPerWG;
+	};
 	enum
-		{
-			DATA_ALIGNMENT = 256,
-			WG_SIZE = 64,
-            BLOCK_SIZE = 256,
-			ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE/WG_SIZE),
-			BITS_PER_PASS = 4,
-			NUM_BUCKET=(1<<BITS_PER_PASS),
-			//	if you change this, change nPerWI in kernel as well
-			NUM_WGS = 20*6,	//	cypress
-//			NUM_WGS = 24*6,	//	cayman
-//			NUM_WGS = 32*4,	//	nv
-		};
-
+	{
+		DATA_ALIGNMENT = 256,
+		WG_SIZE = 64,
+		BLOCK_SIZE = 256,
+		ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE),
+		BITS_PER_PASS = 4,
+		NUM_BUCKET = (1 << BITS_PER_PASS),
+		//	if you change this, change nPerWI in kernel as well
+		NUM_WGS = 20 * 6,  //	cypress
+						   //			NUM_WGS = 24*6,	//	cayman
+						   //			NUM_WGS = 32*4,	//	nv
+	};

 private:
-		
-
 public:
+	b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0);

-		b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity =0);
+	virtual ~b3RadixSort32CL();

-		virtual ~b3RadixSort32CL();
+	void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn,
+				 b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);

-		void execute(b3OpenCLArray<unsigned int>& keysIn, b3OpenCLArray<unsigned int>& keysOut, b3OpenCLArray<unsigned int>& valuesIn, 
-								b3OpenCLArray<unsigned int>& valuesOut, int n, int sortBits = 32);
-
-		///keys only
-		void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits  = 32 );
-
-		void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits  = 32 );
-		void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
-		void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
+	///keys only
+	void execute(b3OpenCLArray<unsigned int>& keysInOut, int sortBits = 32);

+	void execute(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
+	void executeHost(b3OpenCLArray<b3SortData>& keyValuesInOut, int sortBits = 32);
+	void executeHost(b3AlignedObjectArray<b3SortData>& keyValuesInOut, int sortBits = 32);
 };
-#endif //B3_RADIXSORT32_H
-
+#endif  //B3_RADIXSORT32_H
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/BoundSearchKernelsCL.h
@@ -1,87 +1,86 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* boundSearchKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"typedef struct\n"
-"{\n"
-"	u32 m_key; \n"
-"	u32 m_value;\n"
-"}SortData;\n"
-"typedef struct\n"
-"{\n"
-"	u32 m_nSrc;\n"
-"	u32 m_nDst;\n"
-"	u32 m_padding[2];\n"
-"} ConstBuffer;\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"__kernel\n"
-"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
-"					unsigned int nSrc, unsigned int nDst)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < nSrc )\n"
-"	{\n"
-"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
-"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
-"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
-"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
-"		if( iData.m_key != jData.m_key )\n"
-"		{\n"
-"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
-"			u32 k = jData.m_key;\n"
-"			{\n"
-"				dst[k] = gIdx;\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"__kernel\n"
-"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
-"					unsigned int nSrc, unsigned int nDst)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX+1;\n"
-"	if( gIdx < nSrc+1 )\n"
-"	{\n"
-"		SortData first; first.m_key = 0; first.m_value = 0;\n"
-"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
-"		SortData iData = src[gIdx-1];\n"
-"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
-"		if( iData.m_key != jData.m_key )\n"
-"		{\n"
-"			u32 k = iData.m_key;\n"
-"			{\n"
-"				dst[k] = gIdx;\n"
-"			}\n"
-"		}\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"__kernel\n"
-"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
-"					unsigned int nSrc, unsigned int nDst)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	\n"
-"	if( gIdx < nDst )\n"
-"	{\n"
-"		C[gIdx] = A[gIdx] - B[gIdx];\n"
-"	}\n"
-"}\n"
-;
+static const char* boundSearchKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"typedef struct\n"
+	"{\n"
+	"	u32 m_key; \n"
+	"	u32 m_value;\n"
+	"}SortData;\n"
+	"typedef struct\n"
+	"{\n"
+	"	u32 m_nSrc;\n"
+	"	u32 m_nDst;\n"
+	"	u32 m_padding[2];\n"
+	"} ConstBuffer;\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"__kernel\n"
+	"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
+	"					unsigned int nSrc, unsigned int nDst)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < nSrc )\n"
+	"	{\n"
+	"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
+	"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+	"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
+	"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+	"		if( iData.m_key != jData.m_key )\n"
+	"		{\n"
+	"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
+	"			u32 k = jData.m_key;\n"
+	"			{\n"
+	"				dst[k] = gIdx;\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"__kernel\n"
+	"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
+	"					unsigned int nSrc, unsigned int nDst)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX+1;\n"
+	"	if( gIdx < nSrc+1 )\n"
+	"	{\n"
+	"		SortData first; first.m_key = 0; first.m_value = 0;\n"
+	"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+	"		SortData iData = src[gIdx-1];\n"
+	"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+	"		if( iData.m_key != jData.m_key )\n"
+	"		{\n"
+	"			u32 k = iData.m_key;\n"
+	"			{\n"
+	"				dst[k] = gIdx;\n"
+	"			}\n"
+	"		}\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"__kernel\n"
+	"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
+	"					unsigned int nSrc, unsigned int nDst)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	\n"
+	"	if( gIdx < nDst )\n"
+	"	{\n"
+	"		C[gIdx] = A[gIdx] - B[gIdx];\n"
+	"	}\n"
+	"}\n";
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/CopyKernelsCL.h
@@ -1,132 +1,131 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* copyKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"\n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"\n"
-"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
-"\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
-"#define AtomInc(x) atom_inc(&(x))\n"
-"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
-"\n"
-"#define make_uint4 (uint4)\n"
-"#define make_uint2 (uint2)\n"
-"#define make_int2 (int2)\n"
-"\n"
-"typedef struct\n"
-"{\n"
-"	int m_n;\n"
-"	int m_padding[3];\n"
-"} ConstBuffer;\n"
-"\n"
-"\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( gIdx < cb.m_n )\n"
-"	{\n"
-"		float4 a0 = src[gIdx];\n"
-"\n"
-"		dst[ gIdx ] = a0;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( 2*gIdx <= cb.m_n )\n"
-"	{\n"
-"		float4 a0 = src[gIdx*2+0];\n"
-"		float4 a1 = src[gIdx*2+1];\n"
-"\n"
-"		dst[ gIdx*2+0 ] = a0;\n"
-"		dst[ gIdx*2+1 ] = a1;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( 4*gIdx <= cb.m_n )\n"
-"	{\n"
-"		int idx0 = gIdx*4+0;\n"
-"		int idx1 = gIdx*4+1;\n"
-"		int idx2 = gIdx*4+2;\n"
-"		int idx3 = gIdx*4+3;\n"
-"\n"
-"		float4 a0 = src[idx0];\n"
-"		float4 a1 = src[idx1];\n"
-"		float4 a2 = src[idx2];\n"
-"		float4 a3 = src[idx3];\n"
-"\n"
-"		dst[ idx0 ] = a0;\n"
-"		dst[ idx1 ] = a1;\n"
-"		dst[ idx2 ] = a2;\n"
-"		dst[ idx3 ] = a3;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( gIdx < cb.m_n )\n"
-"	{\n"
-"		float a0 = srcF1[gIdx];\n"
-"\n"
-"		dstF1[ gIdx ] = a0;\n"
-"	}\n"
-"}\n"
-"\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
-"					ConstBuffer cb)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"\n"
-"	if( gIdx < cb.m_n )\n"
-"	{\n"
-"		float2 a0 = srcF2[gIdx];\n"
-"\n"
-"		dstF2[ gIdx ] = a0;\n"
-"	}\n"
-"}\n"
-"\n"
-"\n"
-;
+static const char* copyKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"\n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"\n"
+	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+	"\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+	"#define AtomInc(x) atom_inc(&(x))\n"
+	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+	"\n"
+	"#define make_uint4 (uint4)\n"
+	"#define make_uint2 (uint2)\n"
+	"#define make_int2 (int2)\n"
+	"\n"
+	"typedef struct\n"
+	"{\n"
+	"	int m_n;\n"
+	"	int m_padding[3];\n"
+	"} ConstBuffer;\n"
+	"\n"
+	"\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( gIdx < cb.m_n )\n"
+	"	{\n"
+	"		float4 a0 = src[gIdx];\n"
+	"\n"
+	"		dst[ gIdx ] = a0;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( 2*gIdx <= cb.m_n )\n"
+	"	{\n"
+	"		float4 a0 = src[gIdx*2+0];\n"
+	"		float4 a1 = src[gIdx*2+1];\n"
+	"\n"
+	"		dst[ gIdx*2+0 ] = a0;\n"
+	"		dst[ gIdx*2+1 ] = a1;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( 4*gIdx <= cb.m_n )\n"
+	"	{\n"
+	"		int idx0 = gIdx*4+0;\n"
+	"		int idx1 = gIdx*4+1;\n"
+	"		int idx2 = gIdx*4+2;\n"
+	"		int idx3 = gIdx*4+3;\n"
+	"\n"
+	"		float4 a0 = src[idx0];\n"
+	"		float4 a1 = src[idx1];\n"
+	"		float4 a2 = src[idx2];\n"
+	"		float4 a3 = src[idx3];\n"
+	"\n"
+	"		dst[ idx0 ] = a0;\n"
+	"		dst[ idx1 ] = a1;\n"
+	"		dst[ idx2 ] = a2;\n"
+	"		dst[ idx3 ] = a3;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( gIdx < cb.m_n )\n"
+	"	{\n"
+	"		float a0 = srcF1[gIdx];\n"
+	"\n"
+	"		dstF1[ gIdx ] = a0;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
+	"					ConstBuffer cb)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"\n"
+	"	if( gIdx < cb.m_n )\n"
+	"	{\n"
+	"		float2 a0 = srcF2[gIdx];\n"
+	"\n"
+	"		dstF2[ gIdx ] = a0;\n"
+	"	}\n"
+	"}\n"
+	"\n"
+	"\n";
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/FillKernelsCL.h
@@ -1,91 +1,90 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* fillKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
-"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
-"#define AtomInc(x) atom_inc(&(x))\n"
-"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
-"#define make_uint4 (uint4)\n"
-"#define make_uint2 (uint2)\n"
-"#define make_int2 (int2)\n"
-"typedef struct\n"
-"{\n"
-"	union\n"
-"	{\n"
-"		int4 m_data;\n"
-"		uint4 m_unsignedData;\n"
-"		float	m_floatData;\n"
-"	};\n"
-"	int m_offset;\n"
-"	int m_n;\n"
-"	int m_padding[2];\n"
-"} ConstBuffer;\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num_elements )\n"
-"	{\n"
-"		dstInt[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num_elements )\n"
-"	{\n"
-"		dstFloat[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num )\n"
-"	{\n"
-"		dstInt[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num )\n"
-"	{\n"
-"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
-"	}\n"
-"}\n"
-"__kernel\n"
-"__attribute__((reqd_work_group_size(64,1,1)))\n"
-"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
-"{\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	if( gIdx < num )\n"
-"	{\n"
-"		dstInt4[ offset+gIdx ] = value;\n"
-"	}\n"
-"}\n"
-;
+static const char* fillKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+	"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+	"#define AtomInc(x) atom_inc(&(x))\n"
+	"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+	"#define make_uint4 (uint4)\n"
+	"#define make_uint2 (uint2)\n"
+	"#define make_int2 (int2)\n"
+	"typedef struct\n"
+	"{\n"
+	"	union\n"
+	"	{\n"
+	"		int4 m_data;\n"
+	"		uint4 m_unsignedData;\n"
+	"		float	m_floatData;\n"
+	"	};\n"
+	"	int m_offset;\n"
+	"	int m_n;\n"
+	"	int m_padding[2];\n"
+	"} ConstBuffer;\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillIntKernel(__global int* dstInt, 			int num_elements, int value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num_elements )\n"
+	"	{\n"
+	"		dstInt[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillFloatKernel(__global float* dstFloat, 	int num_elements, float value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num_elements )\n"
+	"	{\n"
+	"		dstFloat[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillUnsignedIntKernel(__global unsigned int* dstInt, const int num, const unsigned int value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num )\n"
+	"	{\n"
+	"		dstInt[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillInt2Kernel(__global int2* dstInt2, 	const int num, const int2 value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num )\n"
+	"	{\n"
+	"		dstInt2[ gIdx + offset] = make_int2( value.x, value.y );\n"
+	"	}\n"
+	"}\n"
+	"__kernel\n"
+	"__attribute__((reqd_work_group_size(64,1,1)))\n"
+	"void FillInt4Kernel(__global int4* dstInt4, 		const int num, const int4 value, const int offset)\n"
+	"{\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	if( gIdx < num )\n"
+	"	{\n"
+	"		dstInt4[ offset+gIdx ] = value;\n"
+	"	}\n"
+	"}\n";
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsCL.h
@@ -1,129 +1,128 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* prefixScanKernelsCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"// takahiro end\n"
-"#define WG_SIZE 128 \n"
-"#define m_numElems x\n"
-"#define m_numBlocks y\n"
-"#define m_numScanBlocks z\n"
-"/*typedef struct\n"
-"{\n"
-"	uint m_numElems;\n"
-"	uint m_numBlocks;\n"
-"	uint m_numScanBlocks;\n"
-"	uint m_padding[1];\n"
-"} ConstBuffer;\n"
-"*/\n"
-"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
-"{\n"
-"	u32 blocksum;\n"
-"    int offset = 1;\n"
-"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            data[bi] += data[ai];\n"
-"        }\n"
-"	}\n"
-"    GROUP_LDS_BARRIER;\n"
-"    if( lIdx == 0 )\n"
-"	{\n"
-"		blocksum = data[ n-1 ];\n"
-"        data[ n-1 ] = 0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	offset >>= 1;\n"
-"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            u32 temp = data[ai];\n"
-"            data[ai] = data[bi];\n"
-"            data[bi] += temp;\n"
-"        }\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	return blocksum;\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
-"		uint4 cb)\n"
-"{\n"
-"	__local u32 ldsData[WG_SIZE*2];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
-"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
-"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
-"	if( (2*gIdx) < cb.m_numElems )\n"
-"    {\n"
-"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
-"	}\n"
-"	if( (2*gIdx + 1) < cb.m_numElems )\n"
-"	{\n"
-"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
-"    }\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
-"{\n"
-"	const u32 blockSize = WG_SIZE*2;\n"
-"	int myIdx = GET_GROUP_IDX+1;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	u32 iBlockSum = blockSum[myIdx];\n"
-"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
-"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
-"	{\n"
-"		dst[i] += iBlockSum;\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
-"{\n"
-"	__local u32 ldsData[2048];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	int lSize = GET_GROUP_SIZE;\n"
-"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
-"	{\n"
-"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
-"	{\n"
-"		dst[i] = ldsData[i];\n"
-"	}\n"
-"	if( gIdx == 0 )\n"
-"	{\n"
-"		dst[cb.m_numBlocks] = sum;\n"
-"	}\n"
-"}\n"
-;
+static const char* prefixScanKernelsCL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"// takahiro end\n"
+	"#define WG_SIZE 128 \n"
+	"#define m_numElems x\n"
+	"#define m_numBlocks y\n"
+	"#define m_numScanBlocks z\n"
+	"/*typedef struct\n"
+	"{\n"
+	"	uint m_numElems;\n"
+	"	uint m_numBlocks;\n"
+	"	uint m_numScanBlocks;\n"
+	"	uint m_padding[1];\n"
+	"} ConstBuffer;\n"
+	"*/\n"
+	"u32 ScanExclusive(__local u32* data, u32 n, int lIdx, int lSize)\n"
+	"{\n"
+	"	u32 blocksum;\n"
+	"    int offset = 1;\n"
+	"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            data[bi] += data[ai];\n"
+	"        }\n"
+	"	}\n"
+	"    GROUP_LDS_BARRIER;\n"
+	"    if( lIdx == 0 )\n"
+	"	{\n"
+	"		blocksum = data[ n-1 ];\n"
+	"        data[ n-1 ] = 0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	offset >>= 1;\n"
+	"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            u32 temp = data[ai];\n"
+	"            data[ai] = data[bi];\n"
+	"            data[bi] += temp;\n"
+	"        }\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	return blocksum;\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void LocalScanKernel(__global u32* dst, __global u32 *src, __global u32 *sumBuffer,\n"
+	"		uint4 cb)\n"
+	"{\n"
+	"	__local u32 ldsData[WG_SIZE*2];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+	"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+	"	u32 sum = ScanExclusive(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	if( lIdx == 0 ) sumBuffer[GET_GROUP_IDX] = sum;\n"
+	"	if( (2*gIdx) < cb.m_numElems )\n"
+	"    {\n"
+	"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+	"	}\n"
+	"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+	"	{\n"
+	"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+	"    }\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void AddOffsetKernel(__global u32 *dst, __global u32 *blockSum, uint4 cb)\n"
+	"{\n"
+	"	const u32 blockSize = WG_SIZE*2;\n"
+	"	int myIdx = GET_GROUP_IDX+1;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	u32 iBlockSum = blockSum[myIdx];\n"
+	"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+	"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+	"	{\n"
+	"		dst[i] += iBlockSum;\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void TopLevelScanKernel(__global u32* dst, uint4 cb)\n"
+	"{\n"
+	"	__local u32 ldsData[2048];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	int lSize = GET_GROUP_SIZE;\n"
+	"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+	"	{\n"
+	"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	u32 sum = ScanExclusive(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+	"	{\n"
+	"		dst[i] = ldsData[i];\n"
+	"	}\n"
+	"	if( gIdx == 0 )\n"
+	"	{\n"
+	"		dst[cb.m_numBlocks] = sum;\n"
+	"	}\n"
+	"}\n";
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/PrefixScanKernelsFloat4CL.h
@@ -1,129 +1,128 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* prefixScanKernelsFloat4CL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Takahiro Harada\n"
-"typedef unsigned int u32;\n"
-"#define GET_GROUP_IDX get_group_id(0)\n"
-"#define GET_LOCAL_IDX get_local_id(0)\n"
-"#define GET_GLOBAL_IDX get_global_id(0)\n"
-"#define GET_GROUP_SIZE get_local_size(0)\n"
-"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
-"// takahiro end\n"
-"#define WG_SIZE 128 \n"
-"#define m_numElems x\n"
-"#define m_numBlocks y\n"
-"#define m_numScanBlocks z\n"
-"/*typedef struct\n"
-"{\n"
-"	uint m_numElems;\n"
-"	uint m_numBlocks;\n"
-"	uint m_numScanBlocks;\n"
-"	uint m_padding[1];\n"
-"} ConstBuffer;\n"
-"*/\n"
-"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
-"{\n"
-"	float4 blocksum;\n"
-"    int offset = 1;\n"
-"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            data[bi] += data[ai];\n"
-"        }\n"
-"	}\n"
-"    GROUP_LDS_BARRIER;\n"
-"    if( lIdx == 0 )\n"
-"	{\n"
-"		blocksum = data[ n-1 ];\n"
-"    data[ n-1 ] = 0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	offset >>= 1;\n"
-"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
-"    {\n"
-"        GROUP_LDS_BARRIER;\n"
-"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
-"        {\n"
-"            int ai = offset*(2*iIdx+1)-1;\n"
-"            int bi = offset*(2*iIdx+2)-1;\n"
-"            float4 temp = data[ai];\n"
-"            data[ai] = data[bi];\n"
-"            data[bi] += temp;\n"
-"        }\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	return blocksum;\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb)\n"
-"{\n"
-"	__local float4 ldsData[WG_SIZE*2];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
-"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
-"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	if( lIdx == 0 ) \n"
-"		sumBuffer[GET_GROUP_IDX] = sum;\n"
-"	if( (2*gIdx) < cb.m_numElems )\n"
-"    {\n"
-"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
-"	}\n"
-"	if( (2*gIdx + 1) < cb.m_numElems )\n"
-"	{\n"
-"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
-"    }\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
-"{\n"
-"	const u32 blockSize = WG_SIZE*2;\n"
-"	int myIdx = GET_GROUP_IDX+1;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	float4 iBlockSum = blockSum[myIdx];\n"
-"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
-"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
-"	{\n"
-"		dst[i] += iBlockSum;\n"
-"	}\n"
-"}\n"
-"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
-"__kernel\n"
-"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
-"{\n"
-"	__local float4 ldsData[2048];\n"
-"	int gIdx = GET_GLOBAL_IDX;\n"
-"	int lIdx = GET_LOCAL_IDX;\n"
-"	int lSize = GET_GROUP_SIZE;\n"
-"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
-"	{\n"
-"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
-"	}\n"
-"	GROUP_LDS_BARRIER;\n"
-"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
-"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
-"	{\n"
-"		dst[i] = ldsData[i];\n"
-"	}\n"
-"	if( gIdx == 0 )\n"
-"	{\n"
-"		dst[cb.m_numBlocks] = sum;\n"
-"	}\n"
-"}\n"
-;
+static const char* prefixScanKernelsFloat4CL =
+	"/*\n"
+	"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+	"This software is provided 'as-is', without any express or implied warranty.\n"
+	"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+	"Permission is granted to anyone to use this software for any purpose, \n"
+	"including commercial applications, and to alter it and redistribute it freely, \n"
+	"subject to the following restrictions:\n"
+	"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+	"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+	"3. This notice may not be removed or altered from any source distribution.\n"
+	"*/\n"
+	"//Originally written by Takahiro Harada\n"
+	"typedef unsigned int u32;\n"
+	"#define GET_GROUP_IDX get_group_id(0)\n"
+	"#define GET_LOCAL_IDX get_local_id(0)\n"
+	"#define GET_GLOBAL_IDX get_global_id(0)\n"
+	"#define GET_GROUP_SIZE get_local_size(0)\n"
+	"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+	"// takahiro end\n"
+	"#define WG_SIZE 128 \n"
+	"#define m_numElems x\n"
+	"#define m_numBlocks y\n"
+	"#define m_numScanBlocks z\n"
+	"/*typedef struct\n"
+	"{\n"
+	"	uint m_numElems;\n"
+	"	uint m_numBlocks;\n"
+	"	uint m_numScanBlocks;\n"
+	"	uint m_padding[1];\n"
+	"} ConstBuffer;\n"
+	"*/\n"
+	"float4 ScanExclusiveFloat4(__local float4* data, u32 n, int lIdx, int lSize)\n"
+	"{\n"
+	"	float4 blocksum;\n"
+	"    int offset = 1;\n"
+	"    for(int nActive=n>>1; nActive>0; nActive>>=1, offset<<=1)\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for(int iIdx=lIdx; iIdx<nActive; iIdx+=lSize)\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            data[bi] += data[ai];\n"
+	"        }\n"
+	"	}\n"
+	"    GROUP_LDS_BARRIER;\n"
+	"    if( lIdx == 0 )\n"
+	"	{\n"
+	"		blocksum = data[ n-1 ];\n"
+	"    data[ n-1 ] = 0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	offset >>= 1;\n"
+	"    for(int nActive=1; nActive<n; nActive<<=1, offset>>=1 )\n"
+	"    {\n"
+	"        GROUP_LDS_BARRIER;\n"
+	"        for( int iIdx = lIdx; iIdx<nActive; iIdx += lSize )\n"
+	"        {\n"
+	"            int ai = offset*(2*iIdx+1)-1;\n"
+	"            int bi = offset*(2*iIdx+2)-1;\n"
+	"            float4 temp = data[ai];\n"
+	"            data[ai] = data[bi];\n"
+	"            data[bi] += temp;\n"
+	"        }\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	return blocksum;\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void LocalScanKernel(__global float4* dst, __global float4* src, __global float4* sumBuffer,	uint4 cb)\n"
+	"{\n"
+	"	__local float4 ldsData[WG_SIZE*2];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	ldsData[2*lIdx]     = ( 2*gIdx < cb.m_numElems )? src[2*gIdx]: 0;\n"
+	"	ldsData[2*lIdx + 1] = ( 2*gIdx+1 < cb.m_numElems )? src[2*gIdx + 1]: 0;\n"
+	"	float4 sum = ScanExclusiveFloat4(ldsData, WG_SIZE*2, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	if( lIdx == 0 ) \n"
+	"		sumBuffer[GET_GROUP_IDX] = sum;\n"
+	"	if( (2*gIdx) < cb.m_numElems )\n"
+	"    {\n"
+	"        dst[2*gIdx]     = ldsData[2*lIdx];\n"
+	"	}\n"
+	"	if( (2*gIdx + 1) < cb.m_numElems )\n"
+	"	{\n"
+	"        dst[2*gIdx + 1] = ldsData[2*lIdx + 1];\n"
+	"    }\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void AddOffsetKernel(__global float4* dst, __global float4* blockSum, uint4 cb)\n"
+	"{\n"
+	"	const u32 blockSize = WG_SIZE*2;\n"
+	"	int myIdx = GET_GROUP_IDX+1;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	float4 iBlockSum = blockSum[myIdx];\n"
+	"	int endValue = min((myIdx+1)*(blockSize), cb.m_numElems);\n"
+	"	for(int i=myIdx*blockSize+lIdx; i<endValue; i+=GET_GROUP_SIZE)\n"
+	"	{\n"
+	"		dst[i] += iBlockSum;\n"
+	"	}\n"
+	"}\n"
+	"__attribute__((reqd_work_group_size(WG_SIZE,1,1)))\n"
+	"__kernel\n"
+	"void TopLevelScanKernel(__global float4* dst, uint4 cb)\n"
+	"{\n"
+	"	__local float4 ldsData[2048];\n"
+	"	int gIdx = GET_GLOBAL_IDX;\n"
+	"	int lIdx = GET_LOCAL_IDX;\n"
+	"	int lSize = GET_GROUP_SIZE;\n"
+	"	for(int i=lIdx; i<cb.m_numScanBlocks; i+=lSize )\n"
+	"	{\n"
+	"		ldsData[i] = (i<cb.m_numBlocks)? dst[i]:0;\n"
+	"	}\n"
+	"	GROUP_LDS_BARRIER;\n"
+	"	float4 sum = ScanExclusiveFloat4(ldsData, cb.m_numScanBlocks, GET_LOCAL_IDX, GET_GROUP_SIZE);\n"
+	"	for(int i=lIdx; i<cb.m_numBlocks; i+=lSize )\n"
+	"	{\n"
+	"		dst[i] = ldsData[i];\n"
+	"	}\n"
+	"	if( gIdx == 0 )\n"
+	"	{\n"
+	"		dst[cb.m_numBlocks] = sum;\n"
+	"	}\n"
+	"}\n";
--- a/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/kernels/RadixSort32KernelsCL.h