diff --git a/build/premake4.lua b/build/premake4.lua
index 78bdeb312..8105683aa 100644
--- a/build/premake4.lua
+++ b/build/premake4.lua
@@ -92,6 +92,7 @@
 		include "../opencl/lds_bank_conflict"
 		include "../opencl/reduce"
 		include "../opencl/gpu_broadphase/test"
+		include "../opencl/gpu_sat/test"
 		
 		
 	end
\ No newline at end of file
diff --git a/build/stringify.bat b/build/stringify.bat
index 820661d04..0c04729f6 100644
--- a/build/stringify.bat
+++ b/build/stringify.bat
@@ -11,6 +11,9 @@ premake4 --file=stringifyKernel.lua --kernelfile="../opencl/parallel_primitives/
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sap.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapKernels.h" --stringname="sapCL" stringify
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kernels/sapFast.cl" --headerfile="../opencl/gpu_broadphase/kernels/sapFastKernels.h" --stringname="sapFastCL" stringify
 
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/sat.cl" --headerfile="../opencl/gpu_sat/kernels/satKernels.h" --stringname="satKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_sat/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
+
 
 
 pause
\ No newline at end of file
diff --git a/opencl/gpu_sat/host/ConvexHullContact.cpp b/opencl/gpu_sat/host/ConvexHullContact.cpp
new file mode 100644
index 000000000..6bdf260b1
--- /dev/null
+++ b/opencl/gpu_sat/host/ConvexHullContact.cpp
@@ -0,0 +1,581 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+///This file was written by Erwin Coumans
+///Separating axis test based on work from Pierre Terdiman,
+///and contact clipping based on work from Simon Hobbs
+
+//#define BT_DEBUG_SAT_FACE
+
+#include "ConvexHullContact.h"
+#include <string.h>//memcpy
+#include "btConvexPolyhedronCL.h"
+
+typedef btAlignedObjectArray<btVector3> btVertexArray;
+#include "parallel_primitives/host/btQuickprof.h"
+
+#include <float.h> //for FLT_MAX
+#include "../basic_initialize/btOpenCLUtils.h"
+#include "parallel_primitives/host/btLauncherCL.h"
+//#include "AdlQuaternion.h"
+
+#include "../kernels/satKernels.h"
+#include "../kernels/satClipHullContacts.h"
+#include "parallel_primitives/host/btAabbUtil2.h"
+
+#define dot3F4 btDot
+
+GpuSatCollision::GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q )
+:m_context(ctx),
+m_device(device),
+m_queue(q),
+m_findSeparatingAxisKernel(0),
+m_findCompoundPairsKernel(0),
+m_processCompoundPairsKernel(0),
+m_clipHullHullKernel(0),
+m_clipCompoundsHullHullKernel(0),
+m_clipFacesAndContactReductionKernel(0),
+m_findClippingFacesKernel(0),
+m_clipHullHullConcaveConvexKernel(0),
+m_extractManifoldAndAddContactKernel(0),
+m_newContactReductionKernel(0),
+m_totalContactsOut(m_context, m_queue)
+{
+	// Every kernel handle is nulled in the initializer list (in declaration order) so the
+	// destructor's null checks are meaningful even if a compile step below fails.
+	m_totalContactsOut.push_back(0);
+
+	cl_int errNum=0;
+
+	// Separating-axis program (embedded source satKernelsCL): one kernel per entry point.
+	{
+		const char* src = satKernelsCL;
+		cl_program satProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,src,&errNum,"","opencl/gpu_rigidbody_pipeline2/sat.cl");
+		btAssert(errNum==CL_SUCCESS);
+
+		m_findSeparatingAxisKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,src, "findSeparatingAxisKernel",&errNum,satProg );
+		btAssert(errNum==CL_SUCCESS);
+		m_findCompoundPairsKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,src, "findCompoundPairsKernel",&errNum,satProg );
+		btAssert(errNum==CL_SUCCESS);
+		m_processCompoundPairsKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,src, "processCompoundPairsKernel",&errNum,satProg );
+		btAssert(errNum==CL_SUCCESS);
+
+		// Kernel objects keep their program alive, so the local reference can be dropped here.
+		if (satProg)
+			clReleaseProgram(satProg);
+	}
+
+	// Contact-clipping program (embedded source satClipKernelsCL).
+	{
+		const char* srcClip = satClipKernelsCL;
+		cl_program satClipContactsProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,srcClip,&errNum,"","opencl/gpu_rigidbody_pipeline2/satClipHullContacts.cl");
+		btAssert(errNum==CL_SUCCESS);
+
+		m_clipHullHullKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+
+		m_clipCompoundsHullHullKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipCompoundsHullHullKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+
+		m_findClippingFacesKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "findClippingFacesKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+
+		m_clipFacesAndContactReductionKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipFacesAndContactReductionKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+
+		m_clipHullHullConcaveConvexKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "clipHullHullConcaveConvexKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+
+		m_extractManifoldAndAddContactKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "extractManifoldAndAddContactKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+		m_newContactReductionKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip, "newContactReductionKernel",&errNum,satClipContactsProg);
+		btAssert(errNum==CL_SUCCESS);
+
+		// Same as above: the kernels retain the program, the local handle is no longer needed.
+		if (satClipContactsProg)
+			clReleaseProgram(satClipContactsProg);
+	}
+}
+
+GpuSatCollision::~GpuSatCollision()
+{
+	// Releases the kernel handles held by this object; null handles are skipped.
+	// The handles are gathered into a table and released front to back.
+	cl_kernel ownedKernels[] =
+	{
+		m_findSeparatingAxisKernel,
+		m_findCompoundPairsKernel,
+		m_processCompoundPairsKernel,
+		m_findClippingFacesKernel,
+		m_clipFacesAndContactReductionKernel,
+		m_newContactReductionKernel,
+		m_clipHullHullKernel,
+		m_clipCompoundsHullHullKernel,
+		m_clipHullHullConcaveConvexKernel,
+		m_extractManifoldAndAddContactKernel
+	};
+	const int numOwned = sizeof(ownedKernels)/sizeof(ownedKernels[0]);
+
+	for (int k=0;k<numOwned;k++)
+	{
+		cl_kernel kernel = ownedKernels[k];
+		if (!kernel)
+			continue;
+
+		// Only the handle is released; the member variables themselves are not reset.
+		clReleaseKernel(kernel);
+	}
+}
+
+
+
+void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btInt2>* pairs, int nPairs,
+			const btOpenCLArray<btRigidBodyCL>* bodyBuf,
+			btOpenCLArray<btContact4>* contactOut, int& nContacts,
+														
+			const btOpenCLArray<btConvexPolyhedronCL>& convexData,
+			const btOpenCLArray<btVector3>& gpuVertices,
+			const btOpenCLArray<btVector3>& gpuUniqueEdges,
+			const btOpenCLArray<btGpuFace>& gpuFaces,
+			const btOpenCLArray<int>& gpuIndices,
+			const btOpenCLArray<btCollidable>& gpuCollidables,
+			const btOpenCLArray<btGpuChildShape>& gpuChildShapes,
+
+			const btOpenCLArray<btYetAnotherAabb>& clAabbs,
+            btOpenCLArray<btVector3>& worldVertsB1GPU,
+            btOpenCLArray<btInt4>& clippingFacesOutGPU,
+            btOpenCLArray<btVector3>& worldNormalsAGPU,
+            btOpenCLArray<btVector3>& worldVertsA1GPU,
+            btOpenCLArray<btVector3>& worldVertsB2GPU,     
+			int numObjects,	// NOTE(review): not referenced anywhere in this function body
+			int maxTriConvexPairCapacity,
+			btOpenCLArray<btInt4>& triangleConvexPairsOut,
+			int& numTriConvexPairsOut	// NOTE(review): never written in this function body
+			)
+{
+	if (!nPairs)	// nothing to test: return before touching any output
+		return;
+
+	BT_PROFILE("computeConvexConvexContactsGPUSAT");
+   // printf("nContacts = %d\n",nContacts);
+	// One separating normal and one flag per input pair; both are passed to the SAT kernel and later to the clipping kernels.
+	btOpenCLArray<btVector3> sepNormals(m_context,m_queue);
+	sepNormals.resize(nPairs);
+	btOpenCLArray<int> hasSeparatingNormals(m_context,m_queue);
+	hasSeparatingNormals.resize(nPairs);
+	// Scratch for the concave pairs, sized by the caller-supplied capacity.
+	int concaveCapacity=maxTriConvexPairCapacity;
+	btOpenCLArray<btVector3> concaveSepNormals(m_context,m_queue);
+	concaveSepNormals.resize(concaveCapacity);
+
+	btOpenCLArray<int> numConcavePairsOut(m_context,m_queue);
+	numConcavePairsOut.push_back(0);
+	// NOTE(review): hard-coded capacity; the compound buffers below are created and resized on every call.
+	int compoundPairCapacity=65536*10;
+	btOpenCLArray<btCompoundOverlappingPair> gpuCompoundPairs(m_context,m_queue);
+	gpuCompoundPairs.resize(compoundPairCapacity);
+
+	btOpenCLArray<btVector3> gpuCompoundSepNormals(m_context,m_queue);
+	gpuCompoundSepNormals.resize(compoundPairCapacity);
+	
+	
+	btOpenCLArray<int> gpuHasCompoundSepNormals(m_context,m_queue);
+	gpuHasCompoundSepNormals.resize(compoundPairCapacity);
+	
+	btOpenCLArray<int> numCompoundPairsOut(m_context,m_queue);
+	numCompoundPairsOut.push_back(0);
+
+	int numCompoundPairs = 0;
+
+	bool findSeparatingAxisOnGpu = true;//false;
+	int numConcave =0;
+
+	{
+		clFinish(m_queue);
+		if (findSeparatingAxisOnGpu)
+		{
+	
+		
+			BT_PROFILE("findSeparatingAxisKernel");
+			btBufferInfoCL bInfo[] = { 
+				btBufferInfoCL( pairs->getBufferCL(), true ), 
+				btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+				btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				btBufferInfoCL( convexData.getBufferCL(),true),
+				btBufferInfoCL( gpuVertices.getBufferCL(),true),
+				btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				btBufferInfoCL( gpuFaces.getBufferCL(),true),
+				btBufferInfoCL( gpuIndices.getBufferCL(),true),
+				btBufferInfoCL( clAabbs.getBufferCL(),true),
+				btBufferInfoCL( sepNormals.getBufferCL()),
+				btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
+				btBufferInfoCL( triangleConvexPairsOut.getBufferCL()),
+				btBufferInfoCL( concaveSepNormals.getBufferCL()),
+				btBufferInfoCL( numConcavePairsOut.getBufferCL())
+			};
+
+			btLauncherCL launcher(m_queue, m_findSeparatingAxisKernel);
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( nPairs  );
+			launcher.setConst( maxTriConvexPairCapacity);
+
+			int num = nPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			// Read back the concave pair counter and clamp it to the capacity before shrinking the output array.
+			numConcave = numConcavePairsOut.at(0);
+			if (numConcave > maxTriConvexPairCapacity)
+				numConcave = maxTriConvexPairCapacity;
+			triangleConvexPairsOut.resize(numConcave);
+			
+
+
+
+			{
+				BT_PROFILE("findCompoundPairsKernel");
+				btBufferInfoCL bInfo[] = 
+				{ 
+					btBufferInfoCL( pairs->getBufferCL(), true ), 
+					btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+					btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					btBufferInfoCL( convexData.getBufferCL(),true),
+					btBufferInfoCL( gpuVertices.getBufferCL(),true),
+					btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					btBufferInfoCL( gpuFaces.getBufferCL(),true),
+					btBufferInfoCL( gpuIndices.getBufferCL(),true),
+					btBufferInfoCL( clAabbs.getBufferCL(),true),
+					btBufferInfoCL( gpuChildShapes.getBufferCL(),true),
+					btBufferInfoCL( gpuCompoundPairs.getBufferCL()),
+					btBufferInfoCL( numCompoundPairsOut.getBufferCL())
+				};
+
+				btLauncherCL launcher(m_queue, m_findCompoundPairsKernel);
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+				launcher.setConst( nPairs  );
+				launcher.setConst( compoundPairCapacity);
+
+				int num = nPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+			}
+
+			// Same read-back-and-clamp step for the compound pair counter.
+			numCompoundPairs = numCompoundPairsOut.at(0);
+			//printf("numCompoundPairs =%d\n",numCompoundPairs );
+			if (numCompoundPairs > compoundPairCapacity)
+				numCompoundPairs = compoundPairCapacity;
+
+			gpuCompoundPairs.resize(numCompoundPairs);
+			gpuHasCompoundSepNormals.resize(numCompoundPairs);
+			gpuCompoundSepNormals.resize(numCompoundPairs);
+			
+
+			if (numCompoundPairs)
+			{
+
+				BT_PROFILE("processCompoundPairsKernel");
+				btBufferInfoCL bInfo[] = 
+				{ 
+					btBufferInfoCL( gpuCompoundPairs.getBufferCL(), true ), 
+					btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+					btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+					btBufferInfoCL( convexData.getBufferCL(),true),
+					btBufferInfoCL( gpuVertices.getBufferCL(),true),
+					btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+					btBufferInfoCL( gpuFaces.getBufferCL(),true),
+					btBufferInfoCL( gpuIndices.getBufferCL(),true),
+					btBufferInfoCL( clAabbs.getBufferCL(),true),
+					btBufferInfoCL( gpuChildShapes.getBufferCL(),true),
+					btBufferInfoCL( gpuCompoundSepNormals.getBufferCL()),
+					btBufferInfoCL( gpuHasCompoundSepNormals.getBufferCL())
+				};
+
+				btLauncherCL launcher(m_queue, m_processCompoundPairsKernel);
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+				launcher.setConst( numCompoundPairs  );
+
+				int num = numCompoundPairs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+			
+			}
+
+
+			//printf("numConcave  = %d\n",numConcave);
+
+		}//if (findSeparatingAxisOnGpu)
+
+
+//		printf("hostNormals.size()=%d\n",hostNormals.size());
+		//int numPairs = pairCount.at(0);
+		
+		
+		
+	}
+#ifdef __APPLE__
+	bool contactClippingOnGpu = true;
+#else
+ bool contactClippingOnGpu = true;
+#endif
+	// NOTE(review): both preprocessor branches above assign the same value.
+	if (contactClippingOnGpu)
+	{
+		//BT_PROFILE("clipHullHullKernel");
+
+		// Seed the one-element device counter with the caller's current contact count.
+		m_totalContactsOut.copyFromHostPointer(&nContacts,1,0,true);
+
+		//concave-convex contact clipping
+
+		if (numConcave)
+		{
+			BT_PROFILE("clipHullHullConcaveConvexKernel");
+			nContacts = m_totalContactsOut.at(0);
+			btBufferInfoCL bInfo[] = { 
+				btBufferInfoCL( triangleConvexPairsOut.getBufferCL(), true ), 
+				btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+				btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				btBufferInfoCL( convexData.getBufferCL(),true),
+				btBufferInfoCL( gpuVertices.getBufferCL(),true),
+				btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				btBufferInfoCL( gpuFaces.getBufferCL(),true),
+				btBufferInfoCL( gpuIndices.getBufferCL(),true),
+				btBufferInfoCL( concaveSepNormals.getBufferCL()),
+				btBufferInfoCL( contactOut->getBufferCL()),
+				btBufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			btLauncherCL launcher(m_queue, m_clipHullHullConcaveConvexKernel);
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( numConcave  );
+			int num = numConcave;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			nContacts = m_totalContactsOut.at(0);
+		}
+
+
+		//convex-convex contact clipping
+        if (1)
+		{
+			BT_PROFILE("clipHullHullKernel");
+			bool breakupKernel = false;
+
+#ifdef __APPLE__
+			breakupKernel = true;
+#endif
+
+			if (breakupKernel)
+			{
+
+				// Split path (enabled under __APPLE__): find clipping faces, clip and reduce, then write the contacts.
+			
+            int vertexFaceCapacity = 64;
+            
+            // Per-pair scratch arrays; the vertex arrays hold vertexFaceCapacity entries per pair.
+            worldVertsB1GPU.resize(vertexFaceCapacity*nPairs);
+            
+            
+            clippingFacesOutGPU.resize(nPairs);
+            
+            
+            worldNormalsAGPU.resize(nPairs);
+            
+            
+            worldVertsA1GPU.resize(vertexFaceCapacity*nPairs);
+            
+             
+            worldVertsB2GPU.resize(vertexFaceCapacity*nPairs);
+        
+            
+            
+            {
+				BT_PROFILE("findClippingFacesKernel");
+            btBufferInfoCL bInfo[] = {
+                btBufferInfoCL( pairs->getBufferCL(), true ),
+                btBufferInfoCL( bodyBuf->getBufferCL(),true),
+                btBufferInfoCL( gpuCollidables.getBufferCL(),true),
+                btBufferInfoCL( convexData.getBufferCL(),true),
+                btBufferInfoCL( gpuVertices.getBufferCL(),true),
+                btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+                btBufferInfoCL( gpuFaces.getBufferCL(),true), 
+                btBufferInfoCL( gpuIndices.getBufferCL(),true),
+                btBufferInfoCL( sepNormals.getBufferCL()),
+                btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
+                btBufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                btBufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                btBufferInfoCL( worldNormalsAGPU.getBufferCL()),
+                btBufferInfoCL( worldVertsB1GPU.getBufferCL())
+            };
+            
+            btLauncherCL launcher(m_queue, m_findClippingFacesKernel);
+            launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+            launcher.setConst( vertexFaceCapacity);
+            launcher.setConst( nPairs  );
+            int num = nPairs;
+            launcher.launch1D( num);
+            clFinish(m_queue);
+
+            }
+            
+  
+          
+            
+
+            ///clip face B against face A, reduce contacts and append them to a global contact array
+            if (1)
+            {
+				BT_PROFILE("clipFacesAndContactReductionKernel");
+				//nContacts = m_totalContactsOut.at(0);
+				//int h = hasSeparatingNormals.at(0);
+				//int4 p = clippingFacesOutGPU.at(0);
+                btBufferInfoCL bInfo[] = {
+                    btBufferInfoCL( pairs->getBufferCL(), true ),
+                    btBufferInfoCL( bodyBuf->getBufferCL(),true),
+                    btBufferInfoCL( sepNormals.getBufferCL()),
+                    btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
+					btBufferInfoCL( contactOut->getBufferCL()),
+                    btBufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                    btBufferInfoCL( worldVertsA1GPU.getBufferCL()),
+                    btBufferInfoCL( worldNormalsAGPU.getBufferCL()),
+                    btBufferInfoCL( worldVertsB1GPU.getBufferCL()),
+                    btBufferInfoCL( worldVertsB2GPU.getBufferCL()),
+					btBufferInfoCL( m_totalContactsOut.getBufferCL())
+                };
+                
+                btLauncherCL launcher(m_queue, m_clipFacesAndContactReductionKernel);
+                launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+                launcher.setConst(vertexFaceCapacity);
+
+				launcher.setConst( nPairs  );
+                int debugMode = 0;
+				launcher.setConst( debugMode);
+
+				/*
+				int serializationBytes = launcher.getSerializationBufferSize();
+				unsigned char* buf = (unsigned char*)malloc(serializationBytes+1);
+				int actualWritten = launcher.serializeArguments(buf,serializationBytes+1);
+				FILE* f = fopen("clipFacesAndContactReductionKernel.bin","wb");
+				fwrite(buf,actualWritten,1,f);
+				fclose(f);
+				free(buf);
+				printf("serializationBytes=%d, actualWritten=%d\n",serializationBytes,actualWritten);
+				*/
+
+                int num = nPairs;
+
+                launcher.launch1D( num);
+                clFinish(m_queue);
+                {
+//                    nContacts = m_totalContactsOut.at(0);
+  //                  printf("nContacts = %d\n",nContacts);
+                    // NOTE(review): presumably makes room for one new contact per pair - confirm against the kernel.
+                    contactOut->reserve(nContacts+nPairs);
+                    
+                    {
+                        BT_PROFILE("newContactReductionKernel");
+                            btBufferInfoCL bInfo[] =
+                        {
+                            btBufferInfoCL( pairs->getBufferCL(), true ),
+                            btBufferInfoCL( bodyBuf->getBufferCL(),true),
+                            btBufferInfoCL( sepNormals.getBufferCL()),
+                            btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
+                            btBufferInfoCL( contactOut->getBufferCL()),
+                            btBufferInfoCL( clippingFacesOutGPU.getBufferCL()),
+                            btBufferInfoCL( worldVertsB2GPU.getBufferCL()),
+                            btBufferInfoCL( m_totalContactsOut.getBufferCL())
+                        };
+                        
+                        btLauncherCL launcher(m_queue, m_newContactReductionKernel);
+                        launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+                        launcher.setConst(vertexFaceCapacity);
+                        launcher.setConst( nPairs  );
+                        int num = nPairs;
+                        
+                        launcher.launch1D( num);
+                    }
+                    nContacts = m_totalContactsOut.at(0);	// read back the device-side contact counter
+                    contactOut->resize(nContacts);
+                    
+//                    Contact4 pt = contactOut->at(0);
+                    
+  //                  printf("nContacts = %d\n",nContacts);
+                }
+            }
+	}            
+	else
+	{
+	 	// Single-kernel path, taken when breakupKernel is false.
+		if (nPairs)	// always true here: nPairs==0 already returned at the top
+		{
+			btBufferInfoCL bInfo[] = {
+				btBufferInfoCL( pairs->getBufferCL(), true ), 
+				btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+				btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				btBufferInfoCL( convexData.getBufferCL(),true),
+				btBufferInfoCL( gpuVertices.getBufferCL(),true),
+				btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				btBufferInfoCL( gpuFaces.getBufferCL(),true),
+				btBufferInfoCL( gpuIndices.getBufferCL(),true),
+				btBufferInfoCL( sepNormals.getBufferCL()),
+				btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
+				btBufferInfoCL( contactOut->getBufferCL()),
+				btBufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			btLauncherCL launcher(m_queue, m_clipHullHullKernel);
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( nPairs  );
+			int num = nPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		
+			nContacts = m_totalContactsOut.at(0);
+			contactOut->resize(nContacts);
+		}
+		// NOTE(review): compound pairs are clipped only on this path; the split path above has no equivalent step.
+		int nCompoundsPairs = gpuCompoundPairs.size();
+
+		if (nCompoundsPairs)
+		{
+				btBufferInfoCL bInfo[] = {
+				btBufferInfoCL( gpuCompoundPairs.getBufferCL(), true ), 
+				btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+				btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				btBufferInfoCL( convexData.getBufferCL(),true),
+				btBufferInfoCL( gpuVertices.getBufferCL(),true),
+				btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				btBufferInfoCL( gpuFaces.getBufferCL(),true),
+				btBufferInfoCL( gpuIndices.getBufferCL(),true),
+				btBufferInfoCL( gpuChildShapes.getBufferCL(),true),
+				btBufferInfoCL( gpuCompoundSepNormals.getBufferCL(),true),
+				btBufferInfoCL( gpuHasCompoundSepNormals.getBufferCL(),true),
+				btBufferInfoCL( contactOut->getBufferCL()),
+				btBufferInfoCL( m_totalContactsOut.getBufferCL())	
+			};
+			btLauncherCL launcher(m_queue, m_clipCompoundsHullHullKernel);
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( nCompoundsPairs  );
+			int num = nCompoundsPairs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		
+			nContacts = m_totalContactsOut.at(0);
+			contactOut->resize(nContacts);
+		}
+		}
+		}
+
+	}
+}
diff --git a/opencl/gpu_sat/host/ConvexHullContact.h b/opencl/gpu_sat/host/ConvexHullContact.h
new file mode 100644
index 000000000..299a5d6ae
--- /dev/null
+++ b/opencl/gpu_sat/host/ConvexHullContact.h
@@ -0,0 +1,85 @@
+
+#ifndef _CONVEX_HULL_CONTACT_H
+#define _CONVEX_HULL_CONTACT_H
+
+#include "parallel_primitives/host/btOpenCLArray.h"
+#include "btRigidBodyCL.h"
+#include "parallel_primitives/host/btAlignedObjectArray.h"
+#include "btConvexUtility.h"
+#include "btConvexPolyhedronCL.h"
+#include "btCollidable.h"
+#include "btContact4.h"
+#include "parallel_primitives/host/btInt2.h"
+#include "parallel_primitives/host/btInt4.h"
+
+//#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
+
+
+struct btYetAnotherAabb	// bounding box stored as two 4-float corners; element type of the clAabbs array handed to the SAT kernels
+{
+	union
+	{
+		float m_min[4];
+		int m_minIndices[4];	// integer view that shares storage with m_min
+	};
+	union
+	{
+		float m_max[4];	// only member of this union; the integer views below are disabled
+		//int m_signedMaxIndices[4];
+		//unsigned int m_unsignedMaxIndices[4];
+	};
+};
+
+struct GpuSatCollision	// host-side driver that creates the SAT / contact-clipping kernels and launches them
+{
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue;
+	cl_kernel				m_findSeparatingAxisKernel;
+	cl_kernel				m_findCompoundPairsKernel;
+	cl_kernel				m_processCompoundPairsKernel;
+	// Single-kernel contact clipping for plain pairs and for compound pairs.
+	cl_kernel				m_clipHullHullKernel;
+	cl_kernel				m_clipCompoundsHullHullKernel;
+    // Kernels of the split clipping path (used together with m_newContactReductionKernel).
+    cl_kernel               m_clipFacesAndContactReductionKernel;
+    cl_kernel               m_findClippingFacesKernel;
+    
+	cl_kernel				m_clipHullHullConcaveConvexKernel;
+	cl_kernel				m_extractManifoldAndAddContactKernel;	// NOTE(review): created, but not launched by computeConvexConvexContactsGPUSAT
+    cl_kernel               m_newContactReductionKernel;
+    
+	// One-element device array used as the running contact counter.
+	btOpenCLArray<int>		m_totalContactsOut;
+	// The constructor compiles both embedded kernel sources; the destructor releases every non-null kernel.
+	GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q );
+	virtual ~GpuSatCollision();
+	// Runs the separating-axis and contact-clipping kernels for nPairs pairs. Returns immediately when nPairs is 0;
+	// otherwise nContacts seeds the device counter and receives the counter value read back after clipping.
+	void computeConvexConvexContactsGPUSAT( const btOpenCLArray<btInt2>* pairs, int nPairs, 
+			const btOpenCLArray<btRigidBodyCL>* bodyBuf,
+			btOpenCLArray<btContact4>* contactOut, int& nContacts,
+			const btOpenCLArray<btConvexPolyhedronCL>& hostConvexData,	// NOTE(review): an OpenCL array despite the "host" prefix; named convexData in the definition
+			const btOpenCLArray<btVector3>& vertices,
+			const btOpenCLArray<btVector3>& uniqueEdges,
+			const btOpenCLArray<btGpuFace>& faces,
+			const btOpenCLArray<int>& indices,
+			const btOpenCLArray<btCollidable>& gpuCollidables,
+			const btOpenCLArray<btGpuChildShape>& gpuChildShapes,
+
+			const btOpenCLArray<btYetAnotherAabb>& clAabbs,
+           btOpenCLArray<btVector3>& worldVertsB1GPU,
+           btOpenCLArray<btInt4>& clippingFacesOutGPU,
+           btOpenCLArray<btVector3>& worldNormalsAGPU,
+           btOpenCLArray<btVector3>& worldVertsA1GPU,
+           btOpenCLArray<btVector3>& worldVertsB2GPU,
+			int numObjects,
+			int maxTriConvexPairCapacity,
+			btOpenCLArray<btInt4>& triangleConvexPairs,
+			int& numTriConvexPairsOut
+			);
+
+
+};
+
+#endif //_CONVEX_HULL_CONTACT_H
diff --git a/opencl/gpu_sat/host/btCollidable.h b/opencl/gpu_sat/host/btCollidable.h
new file mode 100644
index 000000000..6209671de
--- /dev/null
+++ b/opencl/gpu_sat/host/btCollidable.h
@@ -0,0 +1,38 @@
+
+#ifndef BT_COLLIDABLE_H
+#define BT_COLLIDABLE_H
+
+struct btCollidable	// per-shape record; element type of the gpuCollidables array passed to the SAT kernels
+{
+	int m_numChildShapes;
+	float m_radius;
+	int m_shapeType;
+	int m_shapeIndex;	// NOTE(review): presumably an index into a shape-type specific array - confirm against the kernels
+};
+
+struct btCollidableNew	// compact variant of btCollidable with 16-bit type/count fields and no radius
+{
+	short int m_shapeType;
+	short int m_numShapes;
+	int m_shapeIndex;
+};
+
+struct btGpuChildShape	// child-shape record; element type of the gpuChildShapes array passed to the compound kernels
+{
+	float	m_childPosition[4];
+	float	m_childOrientation[4];	// four floats - presumably a quaternion, TODO confirm
+	int m_shapeIndex;
+	int m_unused0;	// the three unused ints fill the struct out to twelve 32-bit words (assuming 4-byte int/float)
+	int m_unused1;
+	int m_unused2;
+};
+
+struct btCompoundOverlappingPair	// pair record filled by findCompoundPairsKernel and consumed by the compound SAT / clipping kernels
+{
+	int m_bodyIndexA;
+	int m_bodyIndexB;
+//	int	m_pairType;
+	int m_childShapeIndexA;	// child shape selected on body A (B below); exact indexing convention is defined by the kernels
+	int m_childShapeIndexB;
+};
+#endif //BT_COLLIDABLE_H
diff --git a/opencl/gpu_sat/host/btContact4.h b/opencl/gpu_sat/host/btContact4.h
new file mode 100644
index 000000000..a14f60a2c
--- /dev/null
+++ b/opencl/gpu_sat/host/btContact4.h
@@ -0,0 +1,42 @@
+#ifndef BT_CONTACT4_H
+#define BT_CONTACT4_H
+
+#include "parallel_primitives/host/btVector3.h"
+
+
+ATTRIBUTE_ALIGNED16(struct) btContact4
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+	// Four contact point slots; element [3] of each slot is what getPenetration() returns.
+	btVector3	m_worldPos[4];
+	btVector3	m_worldNormal;
+//	float m_restituitionCoeff;
+//	float m_frictionCoeff;
+	unsigned short  m_restituitionCoeffCmp;	// coefficient in [0,1] scaled by 0xffff, see setRestituitionCoeff()
+	unsigned short  m_frictionCoeffCmp;	// coefficient in [0,1] scaled by 0xffff, see setFrictionCoeff()
+	int m_batchIdx;
+	// Body references carrying a flag in the sign: a negative value marks the body as fixed, abs() gives the index.
+	int m_bodyAPtrAndSignBit;
+	int m_bodyBPtrAndSignBit;
+	// Accessors that strip or test the sign flag.
+	int getBodyA()const {return abs(m_bodyAPtrAndSignBit);}
+	int getBodyB()const {return abs(m_bodyBPtrAndSignBit);}
+	bool isBodyAFixed()const { return m_bodyAPtrAndSignBit<0;}
+	bool isBodyBFixed()const { return m_bodyBPtrAndSignBit<0;}
+	//	TODO: make it safer - the non-const overload hands out a mutable reference to m_batchIdx
+	int& getBatchIdx() { return m_batchIdx; }
+	const int& getBatchIdx() const { return m_batchIdx; }
+	float getRestituitionCoeff() const { return ((float)m_restituitionCoeffCmp/(float)0xffff); }
+	void setRestituitionCoeff( float c ) { btAssert( c >= 0.f && c <= 1.f ); m_restituitionCoeffCmp = (unsigned short)(c*0xffff); }
+	float getFrictionCoeff() const { return ((float)m_frictionCoeffCmp/(float)0xffff); }
+	void setFrictionCoeff( float c ) { btAssert( c >= 0.f && c <= 1.f ); m_frictionCoeffCmp = (unsigned short)(c*0xffff); }
+	// The number of points is kept as a float in element [3] of m_worldNormal.
+	float& getNPoints() { return m_worldNormal[3]; }
+	float getNPoints() const { return m_worldNormal[3]; }
+	// idx selects one of the four m_worldPos slots; no range check is performed.
+	float getPenetration(int idx) const { return m_worldPos[idx][3]; }
+	// A contact counts as invalid when either body reference is 0.
+	bool isInvalid() const { return (getBodyA()==0 || getBodyB()==0); }
+};
+
+#endif //BT_CONTACT4_H
diff --git a/opencl/gpu_sat/host/btConvexPolyhedronCL.h b/opencl/gpu_sat/host/btConvexPolyhedronCL.h
new file mode 100644
index 000000000..ee789f026
--- /dev/null
+++ b/opencl/gpu_sat/host/btConvexPolyhedronCL.h
@@ -0,0 +1,64 @@
+#ifndef CONVEX_POLYHEDRON_CL
+#define CONVEX_POLYHEDRON_CL
+
+#include "parallel_primitives/host/btTransform.h"
+
+struct btGpuFace	// face record; element type of the gpuFaces array passed to the SAT kernels
+{
+	btVector4 m_plane;
+	int m_indexOffset;	// NOTE(review): presumably the first entry of this face in the shared index array - confirm
+	int m_numIndices;
+};
+
+ATTRIBUTE_ALIGNED16(struct) btConvexPolyhedronCL
+{
+	btVector3		m_localCenter;
+	btVector3		m_extents;
+	btVector3		mC;
+	btVector3		mE;
+
+	btScalar		m_radius;
+	int	m_faceOffset;
+	int m_numFaces;
+	int	m_numVertices;
+
+	int m_vertexOffset;
+	int	m_uniqueEdgesOffset;
+	int	m_numUniqueEdges;
+	int m_unused;	
+	// Projects this hull onto the direction dir and returns the covered interval in min/max.
+	// vertices is a shared pool: this hull reads entries m_vertexOffset .. m_vertexOffset+m_numVertices-1.
+	// dir is rotated into local space once, and the translation is added back as a single offset.
+	inline void project(const btTransform& trans, const btVector3& dir, const btAlignedObjectArray<btVector3>& vertices, btScalar& min, btScalar& max) const
+	{
+		min = FLT_MAX;
+		max = -FLT_MAX;
+		int numVerts = m_numVertices;
+
+		const btVector3 localDir = trans.getBasis().transpose()*dir;
+		// (an unused quaternion-based duplicate of localDir used to be computed here on every call)
+		
+		btScalar offset = trans.getOrigin().dot(dir);
+
+		for(int i=0;i<numVerts;i++)
+		{
+			//btVector3 pt = trans * vertices[m_vertexOffset+i];
+			//btScalar dp = pt.dot(dir);
+			btScalar dp = vertices[m_vertexOffset+i].dot(localDir);
+			//btAssert(dp==dpL);
+			if(dp < min)	min = dp;
+			if(dp > max)	max = dp;
+		}
+		if(min>max)	// only possible when no vertex updated the bounds, e.g. m_numVertices==0
+		{
+			btScalar tmp = min;
+			min = max;
+			max = tmp;
+		}
+		min += offset;
+		max += offset;
+	}
+
+};
+
+#endif //CONVEX_POLYHEDRON_CL
\ No newline at end of file
diff --git a/opencl/gpu_sat/host/btConvexUtility.cpp b/opencl/gpu_sat/host/btConvexUtility.cpp
new file mode 100644
index 000000000..a86bf3b1e
--- /dev/null
+++ b/opencl/gpu_sat/host/btConvexUtility.cpp
@@ -0,0 +1,513 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+#include "btConvexUtility.h"
+#include "LinearMath/btConvexHullComputer.h"
+#include "LinearMath/btGrahamScan2dConvexHull.h"
+#include "LinearMath/btQuaternion.h"
+#include "LinearMath/btHashMap.h"
+
+#include "../gpu_rigidbody_pipeline2/ConvexPolyhedronCL.h"
+
+
+
+btConvexUtility::~btConvexUtility()
+{
+}	// nothing to release by hand: btConvexUtility.h declares no raw-pointer members
+
+bool	btConvexUtility::initializePolyhedralFeatures(const btVector3* orgVertices, int numPoints, bool mergeCoplanarTriangles)
+{
+	// Builds m_vertices and m_faces from the convex hull of the numPoints input points, merging groups of
+	// near-parallel hull faces into one polygon each where possible, then calls initialize(). Always returns true.
+	// NOTE(review): mergeCoplanarTriangles is never read in this body -- the merge pass runs unconditionally.
+	btConvexHullComputer conv;
+	conv.compute(&orgVertices[0].getX(), sizeof(btVector3),numPoints,0.f,0.f);
+
+	btAlignedObjectArray<btVector3> faceNormals;
+	int numFaces = conv.faces.size();
+	faceNormals.resize(numFaces);
+	btConvexHullComputer* convexUtil = &conv;
+
+	
+	btAlignedObjectArray<btMyFace>	tmpFaces;
+	tmpFaces.resize(numFaces);
+
+	int numVertices = convexUtil->vertices.size();
+	m_vertices.resize(numVertices);
+	for (int p=0;p<numVertices;p++)
+	{
+		m_vertices[p] = convexUtil->vertices[p];
+	}
+
+	// Pass 1: for every hull face, record its vertex loop and derive a plane from its first two edges.
+	for (int i=0;i<numFaces;i++)
+	{
+		int face = convexUtil->faces[i];
+		//printf("face=%d\n",face);
+		const btConvexHullComputer::Edge*  firstEdge = &convexUtil->edges[face];
+		const btConvexHullComputer::Edge*  edge = firstEdge;
+
+		btVector3 edges[3];
+		int numEdges = 0;
+		//compute face normals
+
+		do
+		{
+			
+			int src = edge->getSourceVertex();
+			tmpFaces[i].m_indices.push_back(src);
+			int targ = edge->getTargetVertex();
+			btVector3 wa = convexUtil->vertices[src];
+
+			btVector3 wb = convexUtil->vertices[targ];
+			btVector3 newEdge = wb-wa;
+			newEdge.normalize();
+			if (numEdges<2)
+				edges[numEdges++] = newEdge;
+
+			edge = edge->getNextEdgeOfFace();
+		} while (edge!=firstEdge);
+
+		btScalar planeEq = 1e30f;	// becomes the smallest vertex.dot(normal) over the face's vertices
+
+		
+		if (numEdges==2)
+		{
+			faceNormals[i] = edges[0].cross(edges[1]);
+			faceNormals[i].normalize();
+			tmpFaces[i].m_plane[0] = faceNormals[i].getX();
+			tmpFaces[i].m_plane[1] = faceNormals[i].getY();
+			tmpFaces[i].m_plane[2] = faceNormals[i].getZ();
+			tmpFaces[i].m_plane[3] = planeEq;
+
+		}
+		else
+		{
+			btAssert(0);//degenerate?
+			faceNormals[i].setZero();
+		}
+
+		for (int v=0;v<tmpFaces[i].m_indices.size();v++)
+		{
+			btScalar eq = m_vertices[tmpFaces[i].m_indices[v]].dot(faceNormals[i]);
+			if (planeEq>eq)
+			{
+				planeEq=eq;
+			}
+		}
+		tmpFaces[i].m_plane[3] = -planeEq;
+	}
+
+	//merge coplanar faces and copy them to m_polyhedron
+
+	btScalar faceWeldThreshold= 0.999f;	// normals whose dot product exceeds this are put in one group
+	btAlignedObjectArray<int> todoFaces;
+	for (int i=0;i<tmpFaces.size();i++)
+		todoFaces.push_back(i);
+
+	while (todoFaces.size())
+	{
+		btAlignedObjectArray<int> coplanarFaceGroup;
+		int refFace = todoFaces[todoFaces.size()-1];
+
+		coplanarFaceGroup.push_back(refFace);
+		btMyFace& faceA = tmpFaces[refFace];
+		todoFaces.pop_back();
+
+		btVector3 faceNormalA(faceA.m_plane[0],faceA.m_plane[1],faceA.m_plane[2]);
+		for (int j=todoFaces.size()-1;j>=0;j--)
+		{
+			int i = todoFaces[j];
+			btMyFace& faceB = tmpFaces[i];
+			btVector3 faceNormalB(faceB.m_plane[0],faceB.m_plane[1],faceB.m_plane[2]);
+			if (faceNormalA.dot(faceNormalB)>faceWeldThreshold)
+			{
+				coplanarFaceGroup.push_back(i);
+				todoFaces.remove(i);	// NOTE(review): removes from the array being walked backwards -- relies on remove() not disturbing entries below j; confirm
+			}
+		}
+
+		// A group of more than one face is merged only if every group vertex dropped by the 2D hull is unused by faces outside the group.
+		bool did_merge = false;
+		if (coplanarFaceGroup.size()>1)
+		{
+			//do the merge: use Graham Scan 2d convex hull
+
+			btAlignedObjectArray<GrahamVector3> orgpoints;
+			btVector3 averageFaceNormal(0,0,0);
+
+			for (int i=0;i<coplanarFaceGroup.size();i++)
+			{
+//				m_polyhedron->m_faces.push_back(tmpFaces[coplanarFaceGroup[i]]);
+
+				btMyFace& face = tmpFaces[coplanarFaceGroup[i]];
+				btVector3 faceNormal(face.m_plane[0],face.m_plane[1],face.m_plane[2]);
+				averageFaceNormal+=faceNormal;
+				for (int f=0;f<face.m_indices.size();f++)
+				{
+					int orgIndex = face.m_indices[f];
+					btVector3 pt = m_vertices[orgIndex];
+					
+					bool found = false;
+
+					for (int i=0;i<orgpoints.size();i++)
+					{
+						//if ((orgpoints[i].m_orgIndex == orgIndex) || ((rotatedPt-orgpoints[i]).length2()<0.0001))
+						if (orgpoints[i].m_orgIndex == orgIndex)
+						{
+							found=true;
+							break;
+						}
+					}
+					if (!found)
+						orgpoints.push_back(GrahamVector3(pt,orgIndex));
+				}
+			}
+
+			
+
+			btMyFace combinedFace;
+			for (int i=0;i<4;i++)
+				combinedFace.m_plane[i] = tmpFaces[coplanarFaceGroup[0]].m_plane[i];
+
+			btAlignedObjectArray<GrahamVector3> hull;
+
+			averageFaceNormal.normalize();
+			GrahamScanConvexHull2D(orgpoints,hull,averageFaceNormal);
+
+			for (int i=0;i<hull.size();i++)
+			{
+				combinedFace.m_indices.push_back(hull[i].m_orgIndex);
+				for(int k = 0; k < orgpoints.size(); k++) 
+				{
+					if(orgpoints[k].m_orgIndex == hull[i].m_orgIndex) 
+					{
+						orgpoints[k].m_orgIndex = -1; // invalidate...
+						break;
+					}
+				}
+			}
+
+			// are there rejected vertices?
+			bool reject_merge = false;
+			
+
+
+			for(int i = 0; i < orgpoints.size(); i++) {
+				if(orgpoints[i].m_orgIndex == -1)
+					continue; // this is in the hull...
+				// this vertex is rejected -- is anybody else using this vertex?
+				for(int j = 0; j < tmpFaces.size(); j++) {
+					
+					btMyFace& face = tmpFaces[j];
+					// is this a face of the current coplanar group?
+					bool is_in_current_group = false;
+					for(int k = 0; k < coplanarFaceGroup.size(); k++) {
+						if(coplanarFaceGroup[k] == j) {
+							is_in_current_group = true;
+							break;
+						}
+					}
+					if(is_in_current_group) // ignore this face...
+						continue;
+					// does this face use this rejected vertex?
+					for(int v = 0; v < face.m_indices.size(); v++) {
+						if(face.m_indices[v] == orgpoints[i].m_orgIndex) {
+							// this rejected vertex is used in another face -- reject merge
+							reject_merge = true;
+							break;
+						}
+					}
+					if(reject_merge)
+						break;
+				}
+				if(reject_merge)
+					break;
+			}
+
+			if (!reject_merge)
+			{
+				// do this merge!
+				did_merge = true;
+				m_faces.push_back(combinedFace);
+			}
+		}
+		if(!did_merge)
+		{
+			for (int i=0;i<coplanarFaceGroup.size();i++)
+			{
+				btMyFace face = tmpFaces[coplanarFaceGroup[i]];
+				m_faces.push_back(face);
+			}
+
+		} 
+
+
+
+	}
+
+	initialize();
+
+	return true;
+}
+
+
+
+
+
+
+// True when no component of v has a magnitude above 1e-6.
+inline bool IsAlmostZero(const btVector3& v)
+{
+	return !(fabsf(v.x())>1e-6 || fabsf(v.y())>1e-6 || fabsf(v.z())>1e-6);
+}
+
+// Order-independent edge key: two vertex indices (narrowed to short int), kept with m_v0 >= m_v1.
+struct btInternalVertexPair
+{
+	short int m_v0;
+	short int m_v1;
+
+	btInternalVertexPair(short int v0,short int v1)
+		:m_v0(v1>v0 ? v1 : v0),
+		m_v1(v1>v0 ? v0 : v1)
+	{
+	}
+	int getHash() const
+	{
+		return (m_v1<<16)+m_v0;
+	}
+	bool equals(const btInternalVertexPair& other) const
+	{
+		return other.m_v0==m_v0 && other.m_v1==m_v1;
+	}
+};
+
+// Faces adjacent to one edge; -1 marks a slot no face has claimed yet.
+struct btInternalEdge
+{
+	short int m_face0;
+	short int m_face1;
+
+	btInternalEdge()
+		:m_face0(-1), m_face1(-1)
+	{}
+};
+
+//
+
+#ifdef TEST_INTERNAL_OBJECTS
+bool btConvexUtility::testContainment() const
+{
+	// Checks the 8 corners of the box m_localCenter +/- m_extents against every face plane.
+	// Bits 2,1,0 of the corner number pick the negative side on x,y,z respectively.
+	for(int corner=0;corner<8;corner++)
+	{
+		const btScalar ex = (corner&4) ? -m_extents[0] : m_extents[0];
+		const btScalar ey = (corner&2) ? -m_extents[1] : m_extents[1];
+		const btScalar ez = (corner&1) ? -m_extents[2] : m_extents[2];
+		const btVector3 LocalPt = m_localCenter + btVector3(ex, ey, ez);
+
+		const int numFaces = m_faces.size();
+		for(int f=0;f<numFaces;f++)
+		{
+			const btScalar* plane = m_faces[f].m_plane;
+			const btVector3 Normal(plane[0], plane[1], plane[2]);
+			const btScalar d = LocalPt.dot(Normal) + plane[3];
+			// a corner on the positive side of any plane means the box is not contained
+			if(d>0.0f)
+				return false;
+		}
+	}
+	return true;
+}
+#endif
+
+void	btConvexUtility::initialize()
+{
+	// Derives m_uniqueEdges and m_localCenter from m_vertices/m_faces; with TEST_INTERNAL_OBJECTS also m_radius, mC, mE and m_extents.
+	btHashMap<btInternalVertexPair,btInternalEdge> edges;
+
+	btScalar TotalArea = 0.0f;
+	
+	m_localCenter.setValue(0, 0, 0);
+	for(int i=0;i<m_faces.size();i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices;
+		for(int j=0;j<NbTris;j++)
+		{
+			int k = (j+1)%numVertices;
+			btInternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
+			btInternalEdge* edptr = edges.find(vp);
+			btVector3 edge = m_vertices[vp.m_v1]-m_vertices[vp.m_v0];
+			edge.normalize();
+
+			bool found = false;
+
+			for (int p=0;p<m_uniqueEdges.size();p++)
+			{
+				// an edge direction equal or opposite to a stored one is not added again
+				if (IsAlmostZero(m_uniqueEdges[p]-edge) || 
+					IsAlmostZero(m_uniqueEdges[p]+edge))
+				{
+					found = true;
+					break;
+				}
+			}
+
+			if (!found)
+			{
+				m_uniqueEdges.push_back(edge);
+			}
+
+			if (edptr)
+			{
+				btAssert(edptr->m_face0>=0);
+				btAssert(edptr->m_face1<0);
+				edptr->m_face1 = i;
+			} else
+			{
+				btInternalEdge ed;
+				ed.m_face0 = i;
+				edges.insert(vp,ed);
+			}
+		}
+	}
+
+#ifdef USE_CONNECTED_FACES
+	for(int i=0;i<m_faces.size();i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		m_faces[i].m_connectedFaces.resize(numVertices);
+
+		for(int j=0;j<numVertices;j++)
+		{
+			int k = (j+1)%numVertices;
+			btInternalVertexPair vp(m_faces[i].m_indices[j],m_faces[i].m_indices[k]);
+			btInternalEdge* edptr = edges.find(vp);
+			btAssert(edptr);
+			btAssert(edptr->m_face0>=0);
+			btAssert(edptr->m_face1>=0);
+
+			int connectedFace = (edptr->m_face0==i)?edptr->m_face1:edptr->m_face0;
+			m_faces[i].m_connectedFaces[j] = connectedFace;
+		}
+	}
+#endif//USE_CONNECTED_FACES
+	// m_localCenter = area-weighted mean of the centroids of the triangle fan around each face's first vertex
+	for(int i=0;i<m_faces.size();i++)
+	{
+		int numVertices = m_faces[i].m_indices.size();
+		int NbTris = numVertices-2;
+		
+		const btVector3& p0 = m_vertices[m_faces[i].m_indices[0]];
+		for(int j=1;j<=NbTris;j++)
+		{
+			int k = (j+1)%numVertices;
+			const btVector3& p1 = m_vertices[m_faces[i].m_indices[j]];
+			const btVector3& p2 = m_vertices[m_faces[i].m_indices[k]];
+			btScalar Area = ((p0 - p1).cross(p0 - p2)).length() * 0.5f;
+			btVector3 Center = (p0+p1+p2)/3.0f;
+			m_localCenter += Area * Center;
+			TotalArea += Area;
+		}
+	}
+	// No faces, or only zero-area faces, leaves TotalArea at 0: keep the centre at the origin rather than dividing by zero.
+	if (TotalArea > 0.0f)
+		m_localCenter /= TotalArea;
+
+
+#ifdef TEST_INTERNAL_OBJECTS
+	if(1)
+	{
+		m_radius = FLT_MAX;
+		for(int i=0;i<m_faces.size();i++)
+		{
+			const btVector3 Normal(m_faces[i].m_plane[0], m_faces[i].m_plane[1], m_faces[i].m_plane[2]);
+			const btScalar dist = btFabs(m_localCenter.dot(Normal) + m_faces[i].m_plane[3]);
+			if(dist<m_radius)
+				m_radius = dist;
+		}
+
+	
+		btScalar MinX = FLT_MAX;
+		btScalar MinY = FLT_MAX;
+		btScalar MinZ = FLT_MAX;
+		btScalar MaxX = -FLT_MAX;
+		btScalar MaxY = -FLT_MAX;
+		btScalar MaxZ = -FLT_MAX;
+		for(int i=0; i<m_vertices.size(); i++)
+		{
+			const btVector3& pt = m_vertices[i];
+			if(pt.x()<MinX)	MinX = pt.x();
+			if(pt.x()>MaxX)	MaxX = pt.x();
+			if(pt.y()<MinY)	MinY = pt.y();
+			if(pt.y()>MaxY)	MaxY = pt.y();
+			if(pt.z()<MinZ)	MinZ = pt.z();
+			if(pt.z()>MaxZ)	MaxZ = pt.z();
+		}
+		mC.setValue(MaxX+MinX, MaxY+MinY, MaxZ+MinZ);
+		mE.setValue(MaxX-MinX, MaxY-MinY, MaxZ-MinZ);
+
+
+
+//		const btScalar r = m_radius / sqrtf(2.0f);
+		const btScalar r = m_radius / sqrtf(3.0f);
+		const int LargestExtent = mE.maxAxis();
+		const btScalar Step = (mE[LargestExtent]*0.5f - r)/1024.0f;
+		m_extents[0] = m_extents[1] = m_extents[2] = r;
+		m_extents[LargestExtent] = mE[LargestExtent]*0.5f;
+		bool FoundBox = false;
+		for(int j=0;j<1024;j++)
+		{
+			if(testContainment())
+			{
+				FoundBox = true;
+				break;
+			}
+
+			m_extents[LargestExtent] -= Step;
+		}
+		if(!FoundBox)
+		{
+			m_extents[0] = m_extents[1] = m_extents[2] = r;
+		}
+		else
+		{
+			// Refine the box
+			const btScalar Step = (m_radius - r)/1024.0f;
+			const int e0 = (1<<LargestExtent) & 3;
+			const int e1 = (1<<e0) & 3;
+
+			for(int j=0;j<1024;j++)
+			{
+				const btScalar Saved0 = m_extents[e0];
+				const btScalar Saved1 = m_extents[e1];
+				m_extents[e0] += Step;
+				m_extents[e1] += Step;
+
+				if(!testContainment())
+				{
+					m_extents[e0] = Saved0;
+					m_extents[e1] = Saved1;
+					break;
+				}
+			}
+		}
+	}
+#endif
+}
diff --git a/opencl/gpu_sat/host/btConvexUtility.h b/opencl/gpu_sat/host/btConvexUtility.h
new file mode 100644
index 000000000..662f5c3b9
--- /dev/null
+++ b/opencl/gpu_sat/host/btConvexUtility.h
@@ -0,0 +1,62 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef _BT_CONVEX_UTILITY_H
+#define _BT_CONVEX_UTILITY_H
+
+#include "parallel_primitives/host/btAlignedObjectArray.h"
+#include "parallel_primitives/host/btTransform.h"
+
+#include "btConvexPolyhedronCL.h"
+
+
+struct btMyFace	// one polygonal face of the hull built by btConvexUtility
+{
+	btAlignedObjectArray<int>	m_indices;	// indices into btConvexUtility::m_vertices; consecutive entries (wrapping around) are treated as the face's edges
+	btScalar	m_plane[4];	// [0..2] face normal, [3] = -(smallest vertex.dot(normal)), as written by initializePolyhedralFeatures
+};
+
+ATTRIBUTE_ALIGNED16(class) btConvexUtility
+{
+	public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	btVector3		m_localCenter;	// area-weighted surface centroid, written by initialize()
+	btVector3		m_extents;	// only written when TEST_INTERNAL_OBJECTS is defined
+	btVector3		mC;	// only written when TEST_INTERNAL_OBJECTS is defined
+	btVector3		mE;	// only written when TEST_INTERNAL_OBJECTS is defined
+	btScalar		m_radius;	// only written when TEST_INTERNAL_OBJECTS is defined
+	
+	btAlignedObjectArray<btVector3>	m_vertices;
+	btAlignedObjectArray<btMyFace>	m_faces;
+	btAlignedObjectArray<btVector3> m_uniqueEdges;
+
+	// Zero the plain-value members so none of them is left indeterminate before (or without) initialize().
+	btConvexUtility()
+		:m_localCenter(0,0,0), m_extents(0,0,0), mC(0,0,0), mE(0,0,0), m_radius(0)
+	{
+	}
+	virtual ~btConvexUtility();
+
+	// Builds the hull of orgVertices and fills m_vertices/m_faces, then runs initialize().
+	bool	initializePolyhedralFeatures(const btVector3* orgVertices, int numVertices, bool mergeCoplanarTriangles=true);
+	// Recomputes m_uniqueEdges and m_localCenter from the current m_vertices/m_faces.
+	void	initialize();
+	// NOTE: defined in btConvexUtility.cpp only when TEST_INTERNAL_OBJECTS is defined.
+	bool testContainment() const;
+};
+#endif
+	
\ No newline at end of file
diff --git a/opencl/gpu_sat/host/btRigidBodyCL.h b/opencl/gpu_sat/host/btRigidBodyCL.h
new file mode 100644
index 000000000..9bf0f633d
--- /dev/null
+++ b/opencl/gpu_sat/host/btRigidBodyCL.h
@@ -0,0 +1,35 @@
+#ifndef BT_RIGID_BODY_CL
+#define BT_RIGID_BODY_CL
+
+#include "parallel_primitives/host/btScalar.h"
+#include "parallel_primitives/host/btMatrix3x3.h"
+
+ATTRIBUTE_ALIGNED16(struct) btRigidBodyCL	// host-side body record; field order matches BodyData in kernels/sat.cl
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	btVector3 		m_pos;
+	btQuaternion 	m_quat;
+	btVector3			m_linVel;
+	btVector3			m_angVel;
+
+	int 					m_collidableIdx;	// declared as u32 in BodyData on the kernel side
+	float 				m_invMass;
+	float 				m_restituitionCoeff;
+	float 				m_frictionCoeff;
+
+	float	getInvMass() const	// plain accessor for m_invMass
+	{
+			return m_invMass;
+	}
+};
+
+
+struct Inertia	// NOTE(review): by the member names, inverse inertia tensors (world-space and initial) -- confirm with the code that fills them
+{
+	btMatrix3x3 m_invInertiaWorld;
+	btMatrix3x3 m_initInvInertia;
+};
+
+
+#endif//BT_RIGID_BODY_CL
diff --git a/opencl/gpu_sat/kernels/sat.cl b/opencl/gpu_sat/kernels/sat.cl
new file mode 100644
index 000000000..3eb1f8faf
--- /dev/null
+++ b/opencl/gpu_sat/kernels/sat.cl
@@ -0,0 +1,1262 @@
+//keep this enum in sync with the CPU version (in AdlCollisionShape.h)
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_CONCAVE_TRIMESH 5
+#define TRIANGLE_NUM_CONVEX_FACES 5
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
+
+
+
+typedef unsigned int u32;
+
+///keep this in sync with btCollidable.h
+typedef struct
+{
+	int m_numChildShapes;
+	int blaat2;	// NOTE(review): not read anywhere in the visible part of this file
+	int m_shapeType;	// NOTE(review): presumably one of the SHAPE_* defines above -- confirm
+	int m_shapeIndex;	// processCompoundPairsKernel uses this to index convexShapes
+	
+} btCollidableGpu;
+
+typedef struct
+{
+	float4	m_childPosition;	// rotated by the parent body's orientation and added to its position in processCompoundPairsKernel
+	float4	m_childOrientation;	// composed with the parent as qtMul(parentOrn, childOrn) in processCompoundPairsKernel
+	int m_shapeIndex;	// processCompoundPairsKernel uses this to index the collidables buffer
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+
+typedef struct	// per-body record; same field order as btRigidBodyCL on the host
+{
+	float4 m_pos;	// processCompoundPairsKernel clears w on its copy before use
+	float4 m_quat;	// orientation, consumed by qtRotate / qtMul
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_collidableIdx;	// processCompoundPairsKernel uses this to index the collidables buffer
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+
+typedef struct  
+{
+	float4		m_localCenter;	// transformed by the body pose to build DeltaC2 in processCompoundPairsKernel
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	
+	float			m_radius;
+	int	m_faceOffset;	// this hull's faces are faces[m_faceOffset + 0 .. m_numFaces-1]
+	int m_numFaces;
+	int	m_numVertices;
+
+	int m_vertexOffset;	// this hull's vertices are vertices[m_vertexOffset + 0 .. m_numVertices-1]
+	int	m_uniqueEdgesOffset;	// this hull's edge directions are uniqueEdges[m_uniqueEdgesOffset + 0 .. m_numUniqueEdges-1]
+	int	m_numUniqueEdges;
+	int m_unused;
+} ConvexPolyhedronCL;
+
+typedef struct 
+{
+	union	// lower bound: one storage slot viewable as a float4, four floats or four ints
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union	// upper bound: same three views
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+typedef struct	// same three fields, in the same order, as struct btGpuFace in host/btConvexPolyhedronCL.h
+{
+	float4 m_plane;	// the findSeparatingAxis* functions rotate this with qtRotate (which ignores w) and use it as the face normal
+	int m_indexOffset;
+	int m_numIndices;
+} btGpuFace;
+
+#define make_float4 (float4)
+
+
+// Cross product of a.xyz and b.xyz.
+// The OpenCL built-in cross() on float4 operands works on the xyz lanes and returns 0 in w,
+// so the inputs need no masking here.
+//
+// Component-wise form of the same result, for reference:
+//	make_float4(a.y*b.z - a.z*b.y,
+//	            a.z*b.x - a.x*b.z,
+//	            a.x*b.y - a.y*b.x,
+//	            0.f)
+__inline
+float4 cross3(float4 a, float4 b)
+{
+	// delegate to the geometric built-in
+	const float4 axb = cross(a,b);
+
+	return axb;
+}
+
+// Dot product of the xyz parts only: w is forced to 0 on both operands first.
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	a.w = 0.f;	b.w = 0.f;
+	return dot(a, b);
+}
+
+// Unit-length copy of v.xyz with w set to 0 (uses the fast_normalize built-in).
+__inline
+float4 fastNormalize4(float4 v)
+{
+	return fast_normalize(make_float4(v.xyz,0.f));
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion;
+
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b);
+
+__inline
+Quaternion qtNormalize(Quaternion in);
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec);
+
+__inline
+Quaternion qtInvert(Quaternion q);
+
+
+
+
+// Quaternion product a*b with the vector part in xyz and the scalar part in w.
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	// vector part: a x b + a.w*b + b.w*a (the w lane computed here is overwritten below)
+	Quaternion prod = cross3( a, b ) + (a.w*b+b.w*a);
+	// scalar part: a.w*b.w - a.xyz . b.xyz
+	prod.w = a.w*b.w - dot3F4(a, b);
+	return prod;
+}
+
+// Returns the quaternion scaled to unit length over all four components.
+// (Routing this through fastNormalize4 would zero w first and so discard the scalar part.)
+__inline
+Quaternion qtNormalize(Quaternion in)
+{
+	return fast_normalize(in);
+}
+// Rotates vec.xyz by q as q * (vec.xyz,0) * conj(q); vec.w is ignored.
+__inline
+float4 qtRotate(Quaternion q, float4 vec)
+{
+	float4 pure = vec;
+	pure.w = 0.f;
+	const Quaternion qConj = qtInvert( q );
+	return qtMul(qtMul(q,pure),qConj);
+}
+
+// Conjugate of q (negated xyz, same w); this is the inverse only when q has unit length.
+__inline Quaternion qtInvert(Quaternion q)
+{
+	return make_float4(-q.x, -q.y, -q.z, q.w);
+}
+
+__inline float4 qtInvRotate(const Quaternion q, float4 vec)
+{	// rotate by the conjugate of q (the inverse rotation when q is unit length)
+	const Quaternion qConj = qtInvert( q );
+	return qtRotate( qConj, vec );
+}
+
+__inline float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{	// rigid transform of a point: rotate *p by *orientation, then add *translation
+	const float4 rotated = qtRotate( *orientation, *p );
+	return rotated + (*translation);
+}
+
+
+
+// Unit-length copy of a.xyz with w = 0.
+__inline
+float4 normalize3(const float4 a)
+{
+	return fastNormalize4( make_float4(a.x, a.y, a.z, 0.f) );
+}
+
+inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, const float4* vertices, float* min, float* max)
+{
+	// Interval covered by the hull's vertices along *dir, written to *min / *max.
+	// The axis is rotated into hull space once; the hull position contributes a scalar shift.
+	const float4 axisLocal = qtInvRotate(orn,*dir);
+	const float shift = dot(pos,*dir);
+	const int first = hull->m_vertexOffset;
+	const int count = hull->m_numVertices;
+
+	float lo = FLT_MAX;
+	float hi = -FLT_MAX;
+	for(int v=0;v<count;v++)
+	{
+		const float dp = dot(vertices[first+v],axisLocal);
+		lo = (dp < lo) ? dp : lo;
+		hi = (dp > hi) ? dp : hi;
+	}
+	if(lo>hi)
+	{
+		// lo/hi were never updated (e.g. the hull has no vertices): swap the sentinels
+		const float tmp = lo; lo = hi; hi = tmp;
+	}
+	*min = lo + shift;
+	*max = hi + shift;
+}
+
+inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, 
+const float4* dir, __global const float4* vertices, float* min, float* max)
+{
+	// Same computation as projectLocal, for a hull and vertex pool passed as __global pointers:
+	// the interval covered by the hull's vertices along *dir is written to *min / *max.
+	const float4 axisLocal = qtInvRotate(orn,*dir);
+	const float shift = dot(pos,*dir);
+	const int first = hull->m_vertexOffset;
+	const int count = hull->m_numVertices;
+
+	float lo = FLT_MAX;
+	float hi = -FLT_MAX;
+	for(int v=0;v<count;v++)
+	{
+		const float dp = dot(vertices[first+v],axisLocal);
+		lo = (dp < lo) ? dp : lo;
+		hi = (dp > hi) ? dp : hi;
+	}
+	if(lo>hi)
+	{
+		// lo/hi were never updated (e.g. the hull has no vertices): swap the sentinels
+		const float tmp = lo; lo = hi; hi = tmp;
+	}
+	*min = lo + shift;
+	*max = hi + shift;
+}
+
+inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)
+{
+	// Single-axis test: false (and *depth untouched) if the projections are disjoint, else the smaller overlap in *depth.
+	float minA,maxA;
+	float minB,maxB;
+	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &minA, &maxA);
+	project(hullB,posB,ornB, sep_axis,verticesB, &minB, &maxB);
+	if(maxA<minB || maxB<minA)
+		return false;
+
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	*depth = overlapAB<overlapBA ? overlapAB : overlapBA;
+	return true;
+}
+
+
+
+
+// True when no component of v.xyz exceeds 1e-6 in magnitude; v.w is not looked at.
+inline bool IsAlmostZero(const float4 v)
+{
+	const bool anyLarge = fabs(v.x)>1e-6f || fabs(v.y)>1e-6f || fabs(v.z)>1e-6f;
+	return !anyLarge;
+}
+
+
+
+bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	// Tests every face normal of hullA (rotated by ornA) as a candidate separating axis.
+	// Returns false as soon as the two hull projections are disjoint on one of them.
+	// Otherwise returns true and, for every normal whose overlap is below the current *dmin,
+	// stores that overlap in *dmin and the axis in *sep; the caller must initialise both.
+	// uniqueEdgesA/B, facesB and indicesA/B are accepted but not used here.
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	const int numFacesA = hullA->m_numFaces;
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+		float4 faceANormalWS = qtRotate(ornA,normal);
+		// orient the axis so it has a non-negative component along DeltaC2
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f;
+
+		float d;
+		if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))
+			return false;
+		if(d<*dmin)
+		{
+			*dmin = d;
+			*sep = faceANormalWS;
+		}
+	}
+	// final orientation: flip *sep if it has a negative component along DeltaC2
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+		*sep = -(*sep);
+	return true;
+}
+
+bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* verticesA, 
+	__global const float4* uniqueEdgesA, 
+	__global const btGpuFace* facesA,
+	__global const int*  indicesA,
+	const float4* verticesB,
+	const float4* uniqueEdgesB, 
+	const btGpuFace* facesB,
+	const int*  indicesB,
+	float4* sep,
+	float* dmin)
+{
+	// Tests every face normal of hullA (rotated by ornA) as a candidate separating axis.
+	// TestSepAxisLocalA takes the non-__global hull first, so hullB's arguments go in its first slots.
+	// Returns false as soon as one axis separates the hulls; otherwise returns true and, for every
+	// normal whose overlap is below the current *dmin, stores that overlap in *dmin and the axis in
+	// *sep (caller initialises both). uniqueEdgesA/B, facesB and indicesA/B are not used here.
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	const int numFacesA = hullA->m_numFaces;
+	for(int i=0;i<numFacesA;i++)
+	{
+		const float4 normal = facesA[hullA->m_faceOffset+i].m_plane;
+		float4 faceANormalWS = qtRotate(ornA,normal);
+		// orient the axis so it has a non-negative component along DeltaC2
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS *= -1.f;
+
+		float d;
+		if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))
+			return false;
+		if(d<*dmin)
+		{
+			*dmin = d;
+			*sep = faceANormalWS;
+		}
+	}
+	// final orientation: flip *sep if it has a negative component along DeltaC2
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+		*sep = -(*sep);
+	return true;
+}
+
+
+
+bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	const float4* verticesA, 
+	const float4* uniqueEdgesA, 
+	const btGpuFace* facesA,
+	const int*  indicesA,
+	__global const float4* verticesB, 
+	__global const float4* uniqueEdgesB, 
+	__global const btGpuFace* facesB,
+	__global const int*  indicesB,
+		float4* sep,
+	float* dmin)
+{
+	// Candidate axes here are the cross products of one unique edge of hullA with one of hullB,
+	// both rotated by their hull's orientation first. Returns false as soon as the two hull
+	// projections are disjoint on a candidate; otherwise returns true after lowering *dmin and
+	// replacing *sep for every candidate whose overlap is below the current *dmin.
+	// *dmin and *sep are read before being written, so the caller must initialise both.
+	// facesA/B and indicesA/B are accepted but not used in this function.
+
+	// hull positions with w cleared so the 4-component dot() in the projections ignores it
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	const int numEdgesA = hullA->m_numUniqueEdges;
+	const int numEdgesB = hullB->m_numUniqueEdges;
+	for(int e0=0;e0<numEdgesA;e0++)
+	{
+		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];
+		const float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<numEdgesB;e1++)
+		{
+			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];
+			const float4 edge1World = qtRotate(ornB,edge1);
+
+			float4 crossje = cross3(edge0World,edge1World);
+
+			// (near-)parallel edges give a vanishing cross product: no usable axis
+			if(IsAlmostZero(crossje))
+				continue;
+
+			// unit axis, oriented to have a non-negative component along DeltaC2
+			crossje = normalize3(crossje);
+			if (dot3F4(DeltaC2,crossje)<0)
+				crossje *= -1.f;
+
+			float Min0,Max0;
+			float Min1,Max1;
+			projectLocal(hullA,posA,ornA,&crossje,verticesA, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,verticesB, &Min1, &Max1);
+
+			// Disjoint intervals: this axis separates the hulls, so the pair cannot overlap.
+			// (The verdict used to be stored in a flag that was immediately reset to true,
+			// which made the function unable to ever return false.)
+			if(Max0<Min1 || Max1<Min0)
+				return false;
+
+			const float d0 = Max0 - Min1;
+			const float d1 = Max1 - Min0;
+			const float dist = d0<d1 ? d0:d1;
+
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	// final orientation: flip *sep if it has a negative component along DeltaC2
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+
+	return true;
+}
+
+
+inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA,const float4 ornA,
+	const float4 posB,const float4 ornB,
+	float4* sep_axis, __global const float4* vertices,float* depth)
+{
+	// Single-axis test: false (and *depth untouched) if the projections are disjoint, else the smaller overlap in *depth.
+	float minA,maxA;
+	float minB,maxB;
+	project(hullA,posA,ornA,sep_axis,vertices, &minA, &maxA);
+	project(hullB,posB,ornB, sep_axis,vertices, &minB, &maxB);
+	if(maxA<minB || maxB<minA)
+		return false;
+
+	const float overlapAB = maxA - minB;
+	const float overlapBA = maxB - minA;
+	*depth = overlapAB<overlapBA ? overlapAB : overlapBA;
+	return true;
+}
+
+
+bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices, 
+	__global const float4* uniqueEdges, 
+	__global const btGpuFace* faces,
+	__global const int*  indices,
+	float4* sep,
+	float* dmin)
+{
+	// Tests every face normal of hullA (rotated by ornA) as a candidate separating axis.
+	// Returns false as soon as the two hull projections are disjoint on one of them.
+	// Otherwise returns true and, for every normal whose overlap is below the current *dmin,
+	// stores that overlap in *dmin and the axis in *sep. Both are read before being written,
+	// so the caller must initialise them. uniqueEdges and indices are accepted but unused.
+
+	// hull positions with w cleared so the 4-component dot() in project() ignores it
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	const int numFacesA = hullA->m_numFaces;
+	for(int i=0;i<numFacesA;i++)
+	{
+		// m_plane.xyz is used as the face normal; qtRotate ignores the w lane of its vector argument
+		const float4 normal = faces[hullA->m_faceOffset+i].m_plane;
+		float4 faceANormalWS = qtRotate(ornA,normal);
+
+		// orient the axis so it has a non-negative component along DeltaC2
+		if (dot3F4(DeltaC2,faceANormalWS)<0)
+			faceANormalWS*=-1.f;
+
+		float d;
+		if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))
+			return false;
+
+		if(d<*dmin)
+		{
+			*dmin = d;
+			*sep = faceANormalWS;
+		}
+	}
+
+	// final orientation: flip *sep if it has a negative component along DeltaC2
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+
+	return true;
+}
+
+
+
+
+bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices, 
+	__global const float4* uniqueEdges, 
+	__global const btGpuFace* faces,
+	__global const int*  indices,
+	float4* sep,
+	float* dmin)
+{
+	// Candidate axes here are the cross products of one unique edge of hullA with one of hullB,
+	// both rotated by their hull's orientation first. Returns false as soon as the two hull
+	// projections are disjoint on a candidate; otherwise returns true after lowering *dmin and
+	// replacing *sep for every candidate whose overlap is below the current *dmin.
+	// *dmin and *sep are read before being written, so the caller must initialise both.
+	// faces and indices are accepted but not used in this function.
+
+	// hull positions with w cleared so the 4-component dot() in project() ignores it
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	const int numEdgesA = hullA->m_numUniqueEdges;
+	const int numEdgesB = hullB->m_numUniqueEdges;
+	for(int e0=0;e0<numEdgesA;e0++)
+	{
+		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];
+		const float4 edge0World = qtRotate(ornA,edge0);
+
+		for(int e1=0;e1<numEdgesB;e1++)
+		{
+			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];
+			const float4 edge1World = qtRotate(ornB,edge1);
+
+			float4 crossje = cross3(edge0World,edge1World);
+
+			// (near-)parallel edges give a vanishing cross product: no usable axis
+			if(IsAlmostZero(crossje))
+				continue;
+
+			// unit axis, oriented to have a non-negative component along DeltaC2
+			crossje = normalize3(crossje);
+			if (dot3F4(DeltaC2,crossje)<0)
+				crossje*=-1.f;
+
+			float Min0,Max0;
+			float Min1,Max1;
+			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+
+			// Disjoint intervals: this axis separates the hulls, so the pair cannot overlap.
+			// (The verdict used to be stored in a flag that was immediately reset to true,
+			// which made the function unable to ever return false.)
+			if(Max0<Min1 || Max1<Min0)
+				return false;
+
+			const float d0 = Max0 - Min1;
+			const float d1 = Max1 - Min0;
+			const float dist = d0<d1 ? d0:d1;
+
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	// final orientation: flip *sep if it has a negative component along DeltaC2
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+
+	return true;
+}
+
+
+// work-in-progress
+__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global const btGpuChildShape* gpuChildShapes,
+																					__global volatile float4* gpuCompoundSepNormalsOut,
+																					__global volatile int* gpuHasCompoundSepNormalsOut,
+																					int numCompoundPairs
+																					)
+{
+	// One work-item per entry of gpuCompoundPairs; work-items with an id at or beyond numCompoundPairs do nothing.
+	int i = get_global_id(0);
+	if (i<numCompoundPairs)
+	{
+		int bodyIndexA = gpuCompoundPairs[i].x;	// .x/.y: indices into rigidBodies
+		int bodyIndexB = gpuCompoundPairs[i].y;
+
+		int childShapeIndexA = gpuCompoundPairs[i].z;	// .z/.w: indices into gpuChildShapes; a negative value selects the body's own collidable
+		int childShapeIndexB = gpuCompoundPairs[i].w;
+		
+		int collidableIndexA = -1;
+		int collidableIndexB = -1;
+		
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		
+		float4 ornB = rigidBodies[bodyIndexB].m_quat;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+							
+		if (childShapeIndexA >= 0)
+		{
+			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+			float4 newPosA = qtRotate(ornA,childPosA)+posA;
+			float4 newOrnA = qtMul(ornA,childOrnA);
+			posA = newPosA;
+			ornA = newOrnA;
+		} else
+		{
+			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		}
+		
+		if (childShapeIndexB>=0)
+		{
+			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+			float4 newPosB = transform(&childPosB,&posB,&ornB);
+			float4 newOrnB = qtMul(ornB,childOrnB);
+			posB = newPosB;
+			ornB = newOrnB;
+		} else
+		{
+			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+		}
+	
+		gpuHasCompoundSepNormalsOut[i] = 0;	// stays 0 unless all three axis searches below return true
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+	
+		int hasSeparatingAxis = 5;	// NOTE(review): written here and below but never read in this kernel
+							
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces;	// NOTE(review): never read in this kernel
+		float dmin = FLT_MAX;
+		posA.w = 0.f;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal = make_float4(1,0,0,0);
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+		hasSeparatingAxis = 4;
+		if (!sepA)
+		{
+			hasSeparatingAxis = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+			// NOTE(review): DeltaC2 is passed unchanged although the hull roles are swapped in the call above -- confirm intended
+			if (!sepB)
+			{
+				hasSeparatingAxis = 0;
+			} else//(!sepB)
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);
+				if (sepEE)
+				{
+						gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);
+						gpuHasCompoundSepNormalsOut[i] = 1;
+				}//sepEE
+			}//(!sepB)
+		}//(!sepA)
+		
+		
+	}
+		
+}
+
+// work-in-progress: for every pairs[i] with at least one SHAPE_COMPOUND_OF_CONVEX_HULLS collidable, appends one int4 per child combination to gpuCompoundPairsOut
+__kernel void   findCompoundPairsKernel( __global const int2* pairs, 
+	__global const BodyData* rigidBodies, 
+	__global const btCollidableGpu* collidables,
+	__global const ConvexPolyhedronCL* convexShapes, 
+	__global const float4* vertices,
+	__global const float4* uniqueEdges,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	__global btAabbCL* aabbs,
+	__global const btGpuChildShape* gpuChildShapes,
+	__global volatile int4* gpuCompoundPairsOut,	// (bodyIndexA, bodyIndexB, childShapeIndexA or -1, childShapeIndexB or -1)
+	__global volatile int* numCompoundPairsOut,	// incremented once per generated pair, also when the pair no longer fits
+	int numPairs,
+	int maxNumCompoundPairsCapacity
+	)
+{
+	// vertices, uniqueEdges, faces, indices and aabbs are not referenced by this kernel.
+	int i = get_global_id(0);
+
+	if (i<numPairs)
+	{
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		// Pairs without any compound shape fall through and produce no output.
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+		{
+			// Case 1: A is a compound; enumerate its children (and B's children when B is a compound too).
+			if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) 
+			{
+
+				int numChildrenA = collidables[collidableIndexA].m_numChildShapes;
+				for (int c=0;c<numChildrenA;c++)
+				{
+					int childShapeIndexA = collidables[collidableIndexA].m_shapeIndex+c;	// children are consecutive entries of gpuChildShapes
+					int childColIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+					// child pose = body pose combined with the child's local position/orientation
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					float4 ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+					float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+					float4 newPosA = qtRotate(ornA,childPosA)+posA;
+					float4 newOrnA = qtMul(ornA,childOrnA);
+
+					int shapeIndexA = collidables[childColIndexA].m_shapeIndex;	// shadows the outer shapeIndexA
+
+					if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+						for (int b=0;b<numChildrenB;b++)
+						{
+							int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+							int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+							float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+							float4 newPosB = transform(&childPosB,&posB,&ornB);
+							float4 newOrnB = qtMul(ornB,childOrnB);
+
+							int shapeIndexB = collidables[childColIndexB].m_shapeIndex;	// shadows the outer shapeIndexB
+
+							if (1)
+							{
+								int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+								float dmin = FLT_MAX;
+								float4 posA = newPosA;
+								posA.w = 0.f;
+								float4 posB = newPosB;
+								posB.w = 0.f;
+								float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+								float4 ornA = newOrnA;
+								float4 c0 = transform(&c0local, &posA, &ornA);
+								float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+								float4 ornB =newOrnB;
+								float4 c1 = transform(&c1local,&posB,&ornB);
+								const float4 DeltaC2 = c0 - c1;
+								// NOTE: numFacesA, dmin, c0, c1 and DeltaC2 are computed but never read; every child pair is emitted.
+								{//
+									int compoundPairIdx = atomic_inc(numCompoundPairsOut);	// reserve an output slot
+									if (compoundPairIdx<maxNumCompoundPairsCapacity)
+									{
+										gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);
+									}
+								}//
+							}//fi(1)
+						} //for (int b=0
+					}//if (collidables[collidableIndexB].
+					else//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+					{
+						if (1)
+						{
+							int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+							float dmin = FLT_MAX;
+							float4 posA = newPosA;
+							posA.w = 0.f;
+							float4 posB = rigidBodies[bodyIndexB].m_pos;
+							posB.w = 0.f;
+							float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+							float4 ornA = newOrnA;
+							float4 c0 = transform(&c0local, &posA, &ornA);
+							float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+							float4 ornB = rigidBodies[bodyIndexB].m_quat;
+							float4 c1 = transform(&c1local,&posB,&ornB);
+							const float4 DeltaC2 = c0 - c1;
+							// same as above: the locals are unused, the pair (child of A, whole B) is always emitted
+							{
+								int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+								if (compoundPairIdx<maxNumCompoundPairsCapacity)
+								{
+									gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);
+								}//if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							}//
+						}//fi (1)
+					}//else (B is not a compound)
+				}//for (int c=0;c<numChildrenA;c++)
+				return;
+			}//if (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			{
+				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;
+				for (int b=0;b<numChildrenB;b++)
+				{
+					int childShapeIndexB = collidables[collidableIndexB].m_shapeIndex+b;
+					int childColIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+					float4 ornB = rigidBodies[bodyIndexB].m_quat;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+					float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+					float4 newPosB = qtRotate(ornB,childPosB)+posB;
+					float4 newOrnB = qtMul(ornB,childOrnB);
+
+					int shapeIndexB = collidables[childColIndexB].m_shapeIndex;
+
+					// Case 2: only B is a compound; emit (whole A, child of B) for every child.
+					//////////////////////////////////////
+
+					if (1)
+					{
+						int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+						float dmin = FLT_MAX;
+						float4 posA = rigidBodies[bodyIndexA].m_pos;
+						posA.w = 0.f;
+						float4 posB = newPosB;
+						posB.w = 0.f;
+						float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+						float4 ornA = rigidBodies[bodyIndexA].m_quat;
+						float4 c0 = transform(&c0local, &posA, &ornA);
+						float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+						float4 ornB =newOrnB;
+						float4 c1 = transform(&c1local,&posB,&ornB);
+						const float4 DeltaC2 = c0 - c1;
+						{//
+							int compoundPairIdx = atomic_inc(numCompoundPairsOut);
+							if (compoundPairIdx<maxNumCompoundPairsCapacity)
+							{
+								gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);
+							}//fi (compoundPairIdx<maxNumCompoundPairsCapacity)
+						}//
+					}//fi (1)	
+				}//for (int b=0;b<numChildrenB;b++)
+				return;
+			}//if (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS)
+			return;
+		}//fi ((collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS) ||(collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS))
+	}//i<numPairs
+}
+
+// work-in-progress: one work-item per pairs[i]; convex-convex results go to hasSeparatingAxis/separatingNormals, trimesh-vs-convex results go to the concave pair list
+__kernel void   findSeparatingAxisKernel( __global const int2* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global btAabbCL* aabbs,
+																					__global volatile float4* separatingNormals,	// written only when hasSeparatingAxis[i] ends up 1
+																					__global volatile int* hasSeparatingAxis,	// per pair: 1 if all three SAT stages returned true, else 0
+																					__global int4* concavePairsOut,	// (bodyIndexA, bodyIndexB, triangle face index, 3)
+																					__global float4* concaveSeparatingNormalsOut,	// xyz = axis, w = dmin
+																					__global volatile int* numConcavePairsOut,	// incremented per accepted triangle, also beyond capacity
+																					int numPairs,
+																					int maxNumConcavePairsCapacity
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+	
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+		// Concave path: A is a triangle mesh; each of its triangles is tested against B as a small convex polyhedron.
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))// && (collidables[collidableIndexB].m_shapeType==SHAPE_CONVEX_HULL))
+		{
+	
+			int numFacesA = convexShapes[shapeIndexA].m_numFaces;
+			int numActualConcaveConvexTests = 0;	// counted below but never read
+			
+			for (int f=0;f<numFacesA;f++)
+			{
+			
+				bool overlap = false;
+				
+				ConvexPolyhedronCL convexPolyhedronA;
+
+			//add 3 vertices of the triangle
+				convexPolyhedronA.m_numVertices = 3;
+				convexPolyhedronA.m_vertexOffset = 0;
+				float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+
+				btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+				float4 triMinAabb, triMaxAabb;	// unused
+				btAabbCL triAabb;
+				triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);
+				triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);
+				// gather the triangle's vertices, their sum (for the center) and their bounds
+				float4 verticesA[3];
+				for (int i=0;i<3;i++)	// this i shadows the work-item index inside the loop
+				{
+					int index = indices[face.m_indexOffset+i];
+					float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+					verticesA[i] = vert;
+					localCenter += vert;
+#if 0
+//just in case some implementation doesn't support component-wise min and max for float4
+					if (triAabb.m_min.x > vert.x)
+						triAabb.m_min.x = vert.x;
+					if (triAabb.m_min.y > vert.y)
+						triAabb.m_min.y = vert.y;
+					if (triAabb.m_min.z > vert.z)
+						triAabb.m_min.z = vert.z;
+
+					if (triAabb.m_max.x < vert.x)
+						triAabb.m_max.x = vert.x;
+					if (triAabb.m_max.y < vert.y)
+						triAabb.m_max.y = vert.y;
+					if (triAabb.m_max.z < vert.z)
+						triAabb.m_max.z = vert.z;
+#else				
+					triAabb.m_min = min(triAabb.m_min,vert);		
+					triAabb.m_max = max(triAabb.m_max,vert);		
+#endif					
+				}
+				// AABB rejection. NOTE(review): triAabb comes from the raw mesh vertices (no body transform applied here) and is compared with aabbs[bodyIndexB] directly - confirm body A is expected to have an identity transform.
+				overlap = true;
+				overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;
+				overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;
+				overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;
+					
+				if (overlap)
+				{
+					float dmin = FLT_MAX;
+					int hasSeparatingAxis=5;	// local flag; shadows the kernel argument of the same name inside this block
+					float4 sepAxis=make_float4(1,2,3,4);
+
+#if 1
+					
+					int localCC=0;	// unused
+					numActualConcaveConvexTests++;
+
+					//a triangle has 3 unique edges
+					convexPolyhedronA.m_numUniqueEdges = 3;
+					convexPolyhedronA.m_uniqueEdgesOffset = 0;
+					float4 uniqueEdgesA[3];
+					
+					uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+					uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+					uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+
+					// faces and indices of the triangle polyhedron live in private arrays, so all offsets are 0-based
+					convexPolyhedronA.m_faceOffset = 0;
+                                        
+					float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+                                   
+					btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];
+					int indicesA[3+3+2+2+2];	// front (3) + back (3) + three edge faces (2 each)
+					int curUsedIndices=0;
+					int fidx=0;
+
+					//front side of triangle
+					{
+						facesA[fidx].m_indexOffset=curUsedIndices;
+						indicesA[0] = 0;
+						indicesA[1] = 1;
+						indicesA[2] = 2;
+						curUsedIndices+=3;
+						float c = face.m_plane.w;
+						facesA[fidx].m_plane.x = normal.x;
+						facesA[fidx].m_plane.y = normal.y;
+						facesA[fidx].m_plane.z = normal.z;
+						facesA[fidx].m_plane.w = c;
+						facesA[fidx].m_numIndices=3;
+					}
+					fidx++;
+					//back side of triangle (reversed winding, negated normal)
+					{
+						facesA[fidx].m_indexOffset=curUsedIndices;
+						indicesA[3]=2;
+						indicesA[4]=1;
+						indicesA[5]=0;
+						curUsedIndices+=3;
+						float c = dot(normal,verticesA[0]);
+						float c1 = -face.m_plane.w;	// unused
+						facesA[fidx].m_plane.x = -normal.x;
+						facesA[fidx].m_plane.y = -normal.y;
+						facesA[fidx].m_plane.z = -normal.z;
+						facesA[fidx].m_plane.w = c;
+						facesA[fidx].m_numIndices=3;
+					}
+					fidx++;
+					// three side faces, one per triangle edge: a 2-index face whose plane passes through the edge and is perpendicular to the triangle normal
+					bool addEdgePlanes = true;
+					if (addEdgePlanes)
+					{
+						int numVertices=3;
+						int prevVertex = numVertices-1;
+						for (int i=0;i<numVertices;i++)
+						{
+							float4 v0 = verticesA[i];
+							float4 v1 = verticesA[prevVertex];
+	                                                
+							float4 edgeNormal = normalize(cross(normal,v1-v0));
+							float c = -dot(edgeNormal,v0);
+	
+							facesA[fidx].m_numIndices = 2;
+							facesA[fidx].m_indexOffset=curUsedIndices;
+							indicesA[curUsedIndices++]=i;
+							indicesA[curUsedIndices++]=prevVertex;
+	                                                
+							facesA[fidx].m_plane.x = edgeNormal.x;
+							facesA[fidx].m_plane.y = edgeNormal.y;
+							facesA[fidx].m_plane.z = edgeNormal.z;
+							facesA[fidx].m_plane.w = c;
+							fidx++;
+							prevVertex = i;
+						}
+					}
+					convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+					convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+			
+					// centers of both shapes transformed by their body poses; DeltaC2 is passed to every SAT stage
+					float4 posA = rigidBodies[bodyIndexA].m_pos;
+					posA.w = 0.f;
+					float4 posB = rigidBodies[bodyIndexB].m_pos;
+					posB.w = 0.f;
+					float4 c0local = convexPolyhedronA.m_localCenter;
+					float4 ornA = rigidBodies[bodyIndexA].m_quat;
+					float4 c0 = transform(&c0local, &posA, &ornA);
+					float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+					float4 ornB =rigidBodies[bodyIndexB].m_quat;
+					float4 c1 = transform(&c1local,&posB,&ornB);
+					const float4 DeltaC2 = c0 - c1;
+					// stage 1; the *LocalA/*LocalB helpers are defined outside this chunk and take the private triangle arrays plus the global arrays for B
+					bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+																											rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+																											DeltaC2,
+																											verticesA,uniqueEdgesA,facesA,indicesA,
+																											vertices,uniqueEdges,faces,indices,
+																											&sepAxis,&dmin);
+					hasSeparatingAxis = 4;	// placeholder; overwritten on every path below
+					if (!sepA)
+					{
+						hasSeparatingAxis = 0;
+					} else
+					{
+						bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+																												rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+																												DeltaC2,
+																												vertices,uniqueEdges,faces,indices,
+																												verticesA,uniqueEdgesA,facesA,indicesA,
+																												&sepAxis,&dmin);
+			
+						if (!sepB)
+						{
+							hasSeparatingAxis = 0;
+						} else
+						{
+							bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+																												rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+																												DeltaC2,
+																												verticesA,uniqueEdgesA,facesA,indicesA,
+																												vertices,uniqueEdges,faces,indices,
+																												&sepAxis,&dmin);
+				
+							if (!sepEE)
+							{
+								hasSeparatingAxis = 0;
+							} else
+							{
+								hasSeparatingAxis = 1;
+							}
+						}
+					}	
+#endif
+					// the local flag is 1 only if all three stages returned true; then the triangle is appended to the concave pair list
+					if (hasSeparatingAxis)
+					{
+						int pairIdx = atomic_inc(numConcavePairsOut);	// reserve an output slot
+						if (pairIdx<maxNumConcavePairsCapacity)
+						{
+							concavePairsOut[pairIdx].x = bodyIndexA;
+							concavePairsOut[pairIdx].y = bodyIndexB;
+							concavePairsOut[pairIdx].z = f;
+							concavePairsOut[pairIdx].w = 3;
+							sepAxis.w = dmin;
+							concaveSeparatingNormalsOut[pairIdx]=sepAxis;
+						}
+					}
+				}
+			}
+			//todo//??
+			hasSeparatingAxis[i] = 0;	// the per-pair flag is always cleared for trimesh pairs; results are reported through concavePairsOut only
+			return;
+		}		
+
+	
+		// Everything that is not convex hull vs convex hull is reported as 0 here.
+		if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))
+		{
+			hasSeparatingAxis[i] = 0;
+			return;
+		}
+			
+
+
+		// Convex-convex path: faces of A, faces of B, then edge-edge; any stage returning false ends with flag 0.
+		// NOTE(review): the helpers are defined outside this chunk; presumably "true" means the shapes overlap on every tested axis - confirm.
+		int numFacesA = convexShapes[shapeIndexA].m_numFaces;	// unused
+
+		float dmin = FLT_MAX;
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexShapes[shapeIndexA].m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+		float4 sepNormal;	// not initialized here; handed to the helpers by pointer
+		
+		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																								posB,ornB,
+																								DeltaC2,
+																								vertices,uniqueEdges,faces,
+																								indices,&sepNormal,&dmin);
+		hasSeparatingAxis[i] = 4;	// placeholder; overwritten on every path below
+		if (!sepA)
+		{
+			hasSeparatingAxis[i] = 0;
+		} else
+		{
+			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,
+																									posA,ornA,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+
+			if (!sepB)
+			{
+				hasSeparatingAxis[i] = 0;
+			} else
+			{
+				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,uniqueEdges,faces,
+																									indices,&sepNormal,&dmin);
+				if (!sepEE)
+				{
+					hasSeparatingAxis[i] = 0;
+				} else
+				{
+					hasSeparatingAxis[i] = 1;
+					separatingNormals[i] = sepNormal;
+				}
+			}
+		}
+		
+	}
+
+}
\ No newline at end of file
diff --git a/opencl/gpu_sat/kernels/satClipHullContacts.cl b/opencl/gpu_sat/kernels/satClipHullContacts.cl
new file mode 100644
index 000000000..e5f56a133
--- /dev/null
+++ b/opencl/gpu_sat/kernels/satClipHullContacts.cl
@@ -0,0 +1,2039 @@
+
+#define TRIANGLE_NUM_CONVEX_FACES 5
+// Shape type ids (presumably values of btCollidableGpu::m_shapeType - confirm against btCollidable.h).
+#define SHAPE_PLANE 4
+#define SHAPE_SPHERE 7
+// Extensions requested by this file: AMD printf and the 32-bit local/global atomics.
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+// Without cl_ext_atomic_counters_32, counter32_t falls back to a plain global int pointer.
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+#define counter32_t volatile __global int*
+#endif
+// Short aliases for work-item queries, barriers and atomics.
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+// Note: AtomInc/AtomInc1 take an lvalue (the macro adds &), AppendInc takes a pointer.
+#define max2 max
+#define min2 min
+
+typedef unsigned int u32;
+
+
+
+typedef struct
+{
+	float4 m_worldPos[4];	// storage for four points; the used count is read through GET_NPOINTS
+	float4 m_worldNormal;	//	w: m_nPoints
+//	float m_restituitionCoeff;
+//	float m_frictionCoeff;
+	u32 m_coeffs;
+	u32 m_batchIdx;
+//	int m_nPoints;
+//	int m_padding0;
+
+	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr
+	int m_bodyBPtrAndSignBit;
+} Contact4;
+
+
+///keep this in sync with btCollidable.h
+typedef struct
+{
+	int m_numChildShapes;	// child count; the SAT kernels read it for compound shapes
+	float m_radius;
+	int m_shapeType;	// compared against the SHAPE_* ids
+	int m_shapeIndex;	// SAT kernels use it as an index into convexShapes, or as the first child slot in gpuChildShapes for compounds
+	
+} btCollidableGpu;
+
+typedef struct
+{
+	float4	m_childPosition;	// child position; the SAT kernels rotate it by the body orientation and add the body position
+	float4	m_childOrientation;	// child orientation; multiplied onto the body orientation with qtMul
+	int m_shapeIndex;	// the SAT kernels use it as an index into the collidables array
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+#define GET_NPOINTS(x) (x).m_worldNormal.w	// Contact4 keeps its point count in the w component of the normal
+
+typedef struct
+{
+	float4 m_pos;	// body position, used as the translation passed to transform()
+	float4 m_quat;	// body orientation quaternion
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_collidableIdx;	// index into the collidables array
+	float m_invMass;	// the SAT kernels skip pairs in which both bodies have m_invMass == 0
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+
+typedef struct  
+{
+	float4		m_localCenter;	// center in the shape's local frame; kernels transform it by the body pose
+	float4		m_extents;
+	float4		mC;
+	float4		mE;
+	
+	float			m_radius;
+	int	m_faceOffset;	// first face of this hull in the shared faces array
+	int m_numFaces;	// number of faces starting at m_faceOffset
+	int	m_numVertices;
+	
+	int m_vertexOffset;	// base added to every vertex index of this hull in the shared vertices array
+	int	m_uniqueEdgesOffset;	// presumably the first entry in the uniqueEdges array - confirm
+	int	m_numUniqueEdges;
+	int m_unused;
+
+} ConvexPolyhedronCL;
+
+typedef struct
+{
+	float4 m_plane;	// xyz = face normal, w = plane constant (both in the hull's local frame)
+	int m_indexOffset;	// first entry of this face in the indices array
+	int m_numIndices;	// number of vertex indices (polygon corners) of this face
+} btGpuFace;
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+// make_xxx(...) expands to an OpenCL vector literal, e.g. make_float4(a,b,c,d) becomes (float4)(a,b,c,d).
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+
+// Scalar division routed through the native_divide built-in instead of operator/.
+__inline float fastDiv(float numerator, float denominator)
+{
+	const float quotient = native_divide(numerator, denominator);
+	return quotient;
+}
+
+__inline float4 fastDiv4(float4 numerator, float4 denominator)
+{
+	const float4 quotient = native_divide(numerator, denominator);	// component-wise
+	return quotient;
+}
+
+
+__inline float4 cross3(float4 a, float4 b)
+{
+	const float4 product = cross(a, b);	// thin wrapper over the built-in float4 cross product
+	return product;
+}
+
+//#define dot3F4 dot
+
+// Dot product of the xyz parts of a and b; both .w components are ignored.
+__inline float dot3F4(float4 a, float4 b)
+{
+	a.w = 0.f;	// a and b are by-value copies, so zeroing .w stays local
+	b.w = 0.f;
+	return dot(a, b);
+}
+
+__inline float4 fastNormalize4(float4 v)
+{
+	const float4 unit = fast_normalize(v);	// built-in; v is passed through unchanged, .w included
+	return unit;
+}
+
+
+///////////////////////////////////////
+//	Quaternion
+///////////////////////////////////////
+
+typedef float4 Quaternion;
+// Stored as (x, y, z, w): xyz is the vector part, w the scalar part (see qtMul and qtInvert below).
+__inline
+Quaternion qtMul(Quaternion a, Quaternion b);
+
+__inline
+Quaternion qtNormalize(Quaternion in);
+
+__inline
+float4 qtRotate(Quaternion q, float4 vec);
+
+__inline
+Quaternion qtInvert(Quaternion q);
+
+
+
+
+// Quaternion product a*b for quaternions stored as (xyz = vector part, w = scalar part).
+__inline Quaternion qtMul(Quaternion a, Quaternion b)
+{
+	// vector part: cross(a,b) + a.w*b + b.w*a (the .w produced here is replaced below)
+	Quaternion product = cross3( a, b ) + (a.w*b+b.w*a);
+
+	// scalar part: a.w*b.w - dot(a.xyz, b.xyz)
+	product.w = a.w*b.w - dot3F4(a, b);
+	return product;
+}
+
+// Normalizes a quaternion through fastNormalize4.
+__inline Quaternion qtNormalize(Quaternion in)
+{
+	const Quaternion unit = fastNormalize4(in);
+	// exact alternative kept for reference: in /= length( in );
+	return unit;
+}
+// Computes q * (vec.xyz, 0) * qtInvert(q); the incoming vec.w is discarded.
+__inline float4 qtRotate(Quaternion q, float4 vec)
+{
+	vec.w = 0.f;	// vec is a by-value copy, treated as a quaternion with zero scalar part
+	const Quaternion qConj = qtInvert( q );
+	const Quaternion halfRotated = qtMul(q,vec);
+	const float4 rotated = qtMul(halfRotated,qConj);
+	return rotated;
+}
+
+__inline Quaternion qtInvert(Quaternion q)
+{
+	q.xyz = -q.xyz;	// conjugate: negate the vector part, keep w (q is a by-value copy)
+	return q;
+}
+
+__inline float4 qtInvRotate(const Quaternion q, float4 vec)
+{
+	const Quaternion qConj = qtInvert( q );	// rotate by the conjugate of q
+	return qtRotate( qConj, vec );
+}
+
+__inline float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)
+{
+	const float4 rotated = qtRotate( *orientation, *p );	// rotate first, then translate
+	return rotated + (*translation);
+}
+
+
+
+__inline float4 normalize3(const float4 a)
+{
+	float4 xyzOnly = a;
+	xyzOnly.w = 0.f;	// drop .w so that only xyz is handed to fastNormalize4
+	return fastNormalize4( xyzOnly );
+}
+
+
+// Interpolates a + (b - a) * t on x, y and z (t = 0 gives a); the result's w is always 0.
+__inline float4 lerp3(const float4 a,const float4 b, float  t)
+{
+	float4 blended = a + (b - a) * t;
+	blended.w = 0.f;
+	return blended;
+}
+
+
+
+// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut.
+// A vertex p is kept when dot3F4(planeNormalWS,p)+planeEqWS < 0; each edge emits at most two
+// vertices, so ppVtxOut needs room for up to 2*numVertsIn entries and must not alias pVtxIn.
+int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)
+{
+	int numVertsOut = 0;
+
+	// An empty polygon clips to nothing; bail out before pVtxIn[numVertsIn-1] is read.
+	if (numVertsIn < 1)
+		return 0;
+
+	// Walk the closed loop of edges (firstVertex -> endVertex), starting with last -> first.
+	float4 firstVertex = pVtxIn[numVertsIn-1];
+	float ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;
+
+	for (int ve = 0; ve < numVertsIn; ve++)
+	{
+		float4 endVertex = pVtxIn[ve];
+		float de = dot3F4(planeNormalWS,endVertex)+planeEqWS;
+
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex;
+		ds = de;
+	}
+	return numVertsOut;
+}
+
+
+
+// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut.
+// A vertex p is kept when dot3F4(planeNormalWS,p)+planeEqWS < 0; each edge emits at most two
+// vertices, so ppVtxOut needs room for up to 2*numVertsIn entries and must not alias pVtxIn.
+// Private-memory counterpart of clipFaceGlobal.
+int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)
+{
+	int numVertsOut = 0;
+
+	// An empty polygon clips to nothing; bail out before pVtxIn[numVertsIn-1] is read.
+	if (numVertsIn < 1)
+		return 0;
+
+	// Walk the closed loop of edges (firstVertex -> endVertex), starting with last -> first.
+	float4 firstVertex = pVtxIn[numVertsIn-1];
+	float ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;
+
+	for (int ve = 0; ve < numVertsIn; ve++)
+	{
+		float4 endVertex = pVtxIn[ve];
+		float de = dot3F4(planeNormalWS,endVertex)+planeEqWS;
+
+		if (ds<0)
+		{
+			if (de<0)
+			{
+				// Start < 0, end < 0, so output endVertex
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+			else
+			{
+				// Start < 0, end >= 0, so output intersection
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+			}
+		}
+		else
+		{
+			if (de<0)
+			{
+				// Start >= 0, end < 0 so output intersection and end
+				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );
+				ppVtxOut[numVertsOut++] = endVertex;
+			}
+		}
+		firstVertex = endVertex;
+		ds = de;
+	}
+
+	return numVertsOut;
+}
+
+
+// Clips the world-space polygon worldVertsB1[0..numWorldVertsB1) against hull A and gathers contacts.
+//  1) choose the witness face: the face of A whose rotated normal has the smallest dot3F4 with
+//     separatingNormal;
+//  2) clip the polygon against one side plane per edge of that face, ping-ponging between
+//     worldVertsB1 and worldVertsB2 (both may be overwritten);
+//  3) for every remaining vertex evaluate the witness plane, raise the value to minDist if it is
+//     smaller, and store (x,y,z,depth) in contactsOut unless the depth exceeds maxDist.
+// Returns the number of contacts written, never more than contactCapacity.
+// capacityWorldVertsB2 is not enforced (clipFace takes no capacity), so both vertex buffers must be
+// large enough for every intermediate polygon.
+int clipFaceAgainstHull(const float4 separatingNormal, __global const ConvexPolyhedronCL* hullA,  
+	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	__global const float4* vertices,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	float4* contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+
+	// 1) witness face
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 planeA = faces[hullA->m_faceOffset+face].m_plane;
+			const float4 Normal = make_float4(planeA.x, planeA.y, planeA.z,0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+
+	btGpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];
+
+	// Witness plane normal in world space; it does not change per edge, so rotate it once.
+	const float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+	const float4 worldPlaneNormal = qtRotate(ornA,localPlaneNormal);
+
+	// 2) clip polygon to back of planes of all faces of hull A that are adjacent to witness face.
+	// The loop stops once nothing is left, so clipFace is never handed an empty polygon.
+	int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA && numVertsIn>0;e0++)
+	{
+		const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];
+		const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = qtRotate(ornA,edge0);
+
+		// side plane through the edge, spanned by the edge direction and the witness normal
+		const float4 planeNormalWS = -cross3(WorldEdge0,worldPlaneNormal);
+		const float4 worldA1 = transform(&a,&posA,&ornA);
+		const float planeEqWS = -dot3F4(worldA1,planeNormalWS);
+
+		const int numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+
+		// swap input and output buffers for the next edge
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+	}
+
+	// 3) only keep points that are behind the witness face, without exceeding contactCapacity
+	{
+		const float planeEqWS=polyA.m_plane.w-dot3F4(worldPlaneNormal,posA);
+		for (int i=0;i<numVertsIn && numContactsOut<contactCapacity;i++)
+		{
+			float depth = dot3F4(worldPlaneNormal,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+
+			if (depth <=maxDist)
+			{
+				float4 pointInWorld = pVtxIn[i];
+				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+			}
+		}
+	}
+
+	return numContactsOut;
+}
+
+
+
+// Same as clipFaceAgainstHull, but hull A and its vertex/face/index arrays live in private memory.
+//  1) choose the witness face: the face of A whose rotated normal has the smallest dot3F4 with
+//     separatingNormal;
+//  2) clip the polygon worldVertsB1[0..numWorldVertsB1) against one side plane per edge of that
+//     face, ping-ponging between worldVertsB1 and worldVertsB2 (both may be overwritten);
+//  3) for every remaining vertex evaluate the witness plane, raise the value to minDist if it is
+//     smaller, and store (x,y,z,depth) in contactsOut unless the depth exceeds maxDist.
+// Returns the number of contacts written, never more than contactCapacity.
+// verticesB, facesB, indicesB and capacityWorldVertsB2 are not used by this function; both vertex
+// buffers must be large enough for every intermediate polygon.
+int clipFaceAgainstHullLocalA(const float4 separatingNormal, const ConvexPolyhedronCL* hullA,  
+	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,
+	float4* worldVertsB2, int capacityWorldVertsB2,
+	const float minDist, float maxDist,
+	const float4* verticesA,
+	const btGpuFace* facesA,
+	const int* indicesA,
+	__global const float4* verticesB,
+	__global const btGpuFace* facesB,
+	__global const int* indicesB,
+	float4* contactsOut,
+	int contactCapacity)
+{
+	int numContactsOut = 0;
+
+	float4* pVtxIn = worldVertsB1;
+	float4* pVtxOut = worldVertsB2;
+	int numVertsIn = numWorldVertsB1;
+
+	// 1) witness face
+	int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 planeA = facesA[hullA->m_faceOffset+face].m_plane;
+			const float4 Normal = make_float4(planeA.x, planeA.y, planeA.z,0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+			}
+		}
+	}
+	if (closestFaceA<0)
+		return numContactsOut;
+
+	btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];
+
+	// Witness plane normal in world space; it does not change per edge, so rotate it once.
+	const float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);
+	const float4 worldPlaneNormal = qtRotate(ornA,localPlaneNormal);
+
+	// 2) clip polygon to back of planes of all faces of hull A that are adjacent to witness face.
+	// The loop stops once nothing is left, so clipFace is never handed an empty polygon.
+	int numVerticesA = polyA.m_numIndices;
+	for(int e0=0;e0<numVerticesA && numVertsIn>0;e0++)
+	{
+		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];
+		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];
+		const float4 edge0 = a - b;
+		const float4 WorldEdge0 = qtRotate(ornA,edge0);
+
+		// side plane through the edge, spanned by the edge direction and the witness normal
+		const float4 planeNormalWS = -cross3(WorldEdge0,worldPlaneNormal);
+		const float4 worldA1 = transform(&a,&posA,&ornA);
+		const float planeEqWS = -dot3F4(worldA1,planeNormalWS);
+
+		const int numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);
+
+		// swap input and output buffers for the next edge
+		float4* tmp = pVtxOut;
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsIn = numVertsOut;
+	}
+
+	// 3) only keep points that are behind the witness face, without exceeding contactCapacity
+	{
+		const float planeEqWS=polyA.m_plane.w-dot3F4(worldPlaneNormal,posA);
+		for (int i=0;i<numVertsIn && numContactsOut<contactCapacity;i++)
+		{
+			float depth = dot3F4(worldPlaneNormal,pVtxIn[i])+planeEqWS;
+			if (depth <=minDist)
+			{
+				depth = minDist;
+			}
+
+			if (depth <=maxDist)
+			{
+				float4 pointInWorld = pVtxIn[i];
+				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);
+			}
+		}
+	}
+
+	return numContactsOut;
+}
+// Clips the face of hull B that best faces separatingNormal against hull A; returns the number of contacts written to localContactsOut.
+int	clipHullAgainstHull(const float4 separatingNormal,
+	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	__global const float4* vertices,
+	__global const btGpuFace* faces,
+	__global const int* indices,
+	float4*	localContactsOut,
+	int localContactCapacity)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+
+	// closestFaceB: face of B whose rotated normal has the largest dot product with separatingNormal
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, 
+				faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	if (closestFaceB<0) return numContactsOut; // no face selected (e.g. hull B has no faces): never index faces[] with -1
+	{
+		const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++) // stay inside worldVertsB1
+		{
+			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+
+	// closestFaceB is known to be valid here: clip B's world-space face polygon against hull A
+	{
+		numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, 
+				posA,ornA,
+				worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,
+				faces,
+				indices,localContactsOut,localContactCapacity);
+	}
+
+	return numContactsOut;
+}
+
+// Same as clipHullAgainstHull, but hull A and its vertex/face/index arrays are passed through non-__global pointers.
+int	clipHullAgainstHullLocalA(const float4 separatingNormal,
+	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, 
+	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,
+	const float minDist, float maxDist,
+	const float4* verticesA,
+	const btGpuFace* facesA,
+	const int* indicesA,
+	__global const float4* verticesB,
+	__global const btGpuFace* facesB,
+	__global const int* indicesB,
+	float4*	localContactsOut,
+	int localContactCapacity)
+{
+	int numContactsOut = 0;
+	int numWorldVertsB1= 0;
+
+	// closestFaceB: face of B whose rotated normal has the largest dot product with separatingNormal
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, 
+				facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	if (closestFaceB<0) return numContactsOut; // no face selected (e.g. hull B has no faces): never index facesB[] with -1
+	{
+		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];
+		const int numVertices = polyB.m_numIndices;
+		for(int e0=0;e0<numVertices && numWorldVertsB1<capacityWorldVerts;e0++) // stay inside worldVertsB1
+		{
+			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];
+			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+
+	// closestFaceB is known to be valid here: clip B's world-space face polygon against hull A
+	{
+		numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, 
+				posA,ornA,
+				worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,
+				verticesA,facesA,indicesA,
+				verticesB,facesB,indicesB,
+				localContactsOut,localContactCapacity);
+	}
+
+	return numContactsOut;
+}
+// Helper macros used by the manifold reduction below; each one expands to a plain sequential for-loop.
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j]; // adds v[1..n-1] into v[0]
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;} // runs `execution` n times; the loop index is named ie
+#define REDUCE_MAX(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; } // leaves the element with the largest .y in v[0]
+#define REDUCE_MIN(v, n) {int i=0;\
+for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; } // leaves the element with the smallest .y in v[0]
+// Selects up to 4 of the first 64 points in p and stores their indices in contactIdx[0].x/.y/.z/.w; returns the contact count.
+int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)
+{
+	if( nPoints == 0 )
+        return 0;
+    // with four or fewer points the count is returned as-is and contactIdx is not written on this path
+    if (nPoints <=4)
+        return nPoints;
+    
+    // only the first 64 points are considered
+    if (nPoints >64)
+        nPoints = 64;
+    // centroid of the considered points
+	float4 center = make_float4(0.f);
+	{
+		
+		for (int i=0;i<nPoints;i++)
+			center += p[i];
+		center /= (float)nPoints;
+	}
+    
+	
+    
+	//	sample 4 directions
+    // u = nearNormal x (p[0]-center), v = nearNormal x u; both are normalized afterwards
+    float4 aVector = p[0] - center;
+    float4 u = cross3( nearNormal, aVector );
+    float4 v = cross3( nearNormal, u );
+    u = normalize3( u );
+    v = normalize3( v );
+    
+    
+    //keep point with deepest penetration
+    float minW= FLT_MAX;
+    
+    int minIndex=-1;
+    // NOTE(review): the trackers start at FLT_MIN (smallest positive float) and are updated with '<', so each slot keeps the smallest projection seen - confirm this is intended
+    float4 maxDots;
+    maxDots.x = FLT_MIN;
+    maxDots.y = FLT_MIN;
+    maxDots.z = FLT_MIN;
+    maxDots.w = FLT_MIN;
+    // NOTE(review): a contactIdx component is only written when some projection compares below the start value - confirm callers pre-initialise it
+    //	idx, distance
+    for(int ie = 0; ie<nPoints; ie++ )
+    {
+        if (p[ie].w<minW)
+        {
+            minW = p[ie].w;
+            minIndex=ie;
+        }
+        float f;
+        float4 r = p[ie]-center;
+        f = dot3F4( u, r );
+        if (f<maxDots.x)
+        {
+            maxDots.x = f;
+            contactIdx[0].x = ie;
+        }
+        
+        f = dot3F4( -u, r );
+        if (f<maxDots.y)
+        {
+            maxDots.y = f;
+            contactIdx[0].y = ie;
+        }
+        
+        
+        f = dot3F4( v, r );
+        if (f<maxDots.z)
+        {
+            maxDots.z = f;
+            contactIdx[0].z = ie;
+        }
+        
+        f = dot3F4( -v, r );
+        if (f<maxDots.w)
+        {
+            maxDots.w = f;
+            contactIdx[0].w = ie;
+        }
+        
+    }
+    // make sure the point with the smallest w (minIndex) is part of the selection
+    if (contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)
+    {
+        //replace the first contact with minimum (todo: replace contact with least penetration)
+        contactIdx[0].x = minIndex;
+    }
+    
+    return 4;
+    
+}
+
+
+int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)
+{
+    // Placeholder reduction: always selects slots 0..3; p and nearNormal are not inspected.
+    for (int slot = 0; slot < 4; slot++)
+        contactIdx[slot] = slot;
+    
+    // An empty input stays 0; otherwise the count is limited to 4.
+    if (nPoints != 0)
+        nPoints = min2( nPoints, 4 );
+    
+    return nPoints;
+    
+}
+
+
+// Picks up to 4 of the first 64 points in p and writes their indices to contactIdx; returns 0, nPoints (when fewer than 4) or 4.
+int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)
+{
+	if( nPoints == 0 ) return 0;
+	// only the first 64 points are considered
+	nPoints = min2( nPoints, 64 );
+	// centroid: copy the points, sum them into v[0], divide by the count
+	float4 center = make_float4(0.f);
+	{
+		float4 v[64];
+		for (int i=0;i<nPoints;i++)
+			v[i] = p[i];
+		//memcpy( v, p, nPoints*sizeof(float4) );
+		PARALLEL_SUM( v, nPoints );
+		center = v[0]/(float)nPoints;
+	}
+
+	
+
+	{	//	sample 4 directions
+		if( nPoints < 4 )
+		{
+			for(int i=0; i<nPoints; i++) 
+				contactIdx[i] = i;
+			return nPoints;
+		}
+		// u = nearNormal x (p[0]-center), v = nearNormal x u; both are normalized afterwards
+		float4 aVector = p[0] - center;
+		float4 u = cross3( nearNormal, aVector );
+		float4 v = cross3( nearNormal, u );
+		u = normalize3( u );
+		v = normalize3( v );
+
+		int idx[4];
+
+		float2 max00 = make_float2(0,FLT_MAX);
+		{
+			//	idx, distance
+			{
+				{
+					int4 a[64];
+					for(int ie = 0; ie<nPoints; ie++ )
+					{
+						// build one integer key per direction: the float's bit pattern with its low 8 bits
+						// replaced by the point index, so an integer max also carries the index along
+						float f;
+						float4 r = p[ie]-center;
+						f = dot3F4( u, r );
+						a[ie].x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+
+						f = dot3F4( -u, r );
+						a[ie].y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+
+						f = dot3F4( v, r );
+						a[ie].z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+
+						f = dot3F4( -v, r );
+						a[ie].w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);
+					}
+					// per-component running maximum of the keys, accumulated in a[0]
+					for(int ie=0; ie<nPoints; ie++)
+					{
+						a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;
+						a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;
+						a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;
+						a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;
+					}
+					// the low byte of each winning key is the index of the selected point
+					idx[0] = (int)a[0].x & 0xff;
+					idx[1] = (int)a[0].y & 0xff;
+					idx[2] = (int)a[0].z & 0xff;
+					idx[3] = (int)a[0].w & 0xff;
+				}
+			}
+			// h[0] becomes (index, w) of the point with the smallest w; max00 is only read by the commented-out code below
+			{
+				float2 h[64];
+				PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );
+				REDUCE_MIN( h, nPoints );
+				max00 = h[0];
+			}
+		}
+
+		contactIdx[0] = idx[0];
+		contactIdx[1] = idx[1];
+		contactIdx[2] = idx[2];
+		contactIdx[3] = idx[3];
+
+//		if( max00.y < 0.0f )
+//			contactIdx[0] = (int)max00.x;
+
+		//does this sort happen on GPU too?
+		//std::sort( contactIdx, contactIdx+4 );
+
+		return 4;
+	}
+}
+
+
+// One work-item per pair: reduces that pair's clipped contact points to at most 4 and appends one Contact4 to contactsOut.
+__kernel void   extractManifoldAndAddContactKernel(__global const int2* pairs, 
+																	__global const BodyData* rigidBodies, 
+																	__global const float4* closestPointsWorld,
+																	__global const float4* separatingNormalsWorld,
+																	__global const int* contactCounts,
+																	__global const int* contactOffsets,
+																	__global Contact4* restrict contactsOut,
+																	counter32_t nContactsOut,
+																	int numPairs,
+																	int pairIndex
+																	)
+{
+	int idx = get_global_id(0);
+	// work-items beyond numPairs do nothing
+	if (idx<numPairs)
+	{
+		float4 normal = separatingNormalsWorld[idx];
+		int nPoints = min2( contactCounts[idx], 64 ); // localPoints holds 64 entries; never copy more than that
+		__global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];
+		float4 localPoints[64];
+		for (int i=0;i<nPoints;i++)
+		{
+			localPoints[i] = pointsIn[i];
+		}
+		// indices of the selected points; -1 marks a slot that was not filled
+		int contactIdx[4];// = {-1,-1,-1,-1};
+		contactIdx[0] = -1;
+		contactIdx[1] = -1;
+		contactIdx[2] = -1;
+		contactIdx[3] = -1;
+
+		int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);
+		// AppendInc is defined elsewhere; presumably it increments the counter and yields the previous value in dstIdx
+		int dstIdx;
+		AppendInc( nContactsOut, dstIdx );
+		//if ((dstIdx+nContacts) < capacity)
+		{
+			__global Contact4* c = contactsOut + dstIdx;
+			c->m_worldNormal = normal;
+			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+			c->m_batchIdx = idx;
+			int bodyA = pairs[pairIndex].x; // NOTE(review): indexed with the pairIndex argument, not idx - confirm
+			int bodyB = pairs[pairIndex].y;
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;
+			for (int i=0;i<nContacts;i++)
+			{
+				c->m_worldPos[i] = localPoints[contactIdx[i]];
+			}
+			GET_NPOINTS(*c) = nContacts;
+		}
+	}
+}
+// Inverts the transform (translationIn, orientationIn): the orientation is inverted with qtInvert
+// and the negated translation is rotated by that inverse; both results go to the out-pointers.
+void	trInverse(float4 translationIn, Quaternion orientationIn, float4* translationOut, Quaternion* orientationOut)
+{
+	const Quaternion invOrn = qtInvert(orientationIn);
+	*orientationOut = invOrn;	// the orientation is stored before the translation
+	*translationOut = qtRotate(invOrn, -translationIn);
+}
+// Combines two (translation, orientation) transforms: orientations via qtMul, B's translation transformed by A.
+void	trMul(float4 translationA, Quaternion orientationA, float4 translationB, Quaternion orientationB,
+		float4* translationOut, Quaternion* orientationOut)
+{
+	const Quaternion combinedOrn = qtMul(orientationA,orientationB);
+	*orientationOut = combinedOrn;	// the orientation is stored before the translation
+	*translationOut = transform(&translationB,&translationA,&orientationA);
+}
+// Tests a plane (body/collidable A) against a shape described only by m_radius (body/collidable B) and appends at most one Contact4.
+void	computeContactPlaneConvex(int pairIndex,
+																int bodyIndexA, int bodyIndexB, 
+																int collidableIndexA, int collidableIndexB, 
+																__global const BodyData* rigidBodies, 
+																__global const btCollidableGpu* collidables,
+																__global const btGpuFace* faces,
+																__global Contact4* restrict globalContactsOut,
+																counter32_t nGlobalContactsOut,
+																int numPairs)
+{
+	float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;
+	float radius = collidables[collidableIndexB].m_radius;
+	float4 posA1 = rigidBodies[bodyIndexA].m_pos;
+	float4 ornA1 = rigidBodies[bodyIndexA].m_quat;
+	float4 posB1 = rigidBodies[bodyIndexB].m_pos;
+	float4 ornB1 = rigidBodies[bodyIndexB].m_quat;
+	// convexInPlaneTrans = inverse(A) * B and planeInConvex = inverse(B) * A, built with trInverse/trMul
+	bool hasCollision = false;
+	float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);
+	float planeConstant = planeEq.w;
+	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;
+	{
+		float4 invPosA;Quaternion invOrnA;
+		trInverse(posA1,ornA1,&invPosA,&invOrnA);
+		trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);
+	}
+	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;
+	{
+		float4 invPosB;Quaternion invOrnB;
+		trInverse(posB1,ornB1,&invPosB,&invOrnB);
+		trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1);	
+	}
+	float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius; // point at distance radius along -planeNormal, in B's frame
+	float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);
+	float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;
+	hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();
+	if (hasCollision)
+	{
+		float4 vtxInPlaneProjected1 = vtxInPlane1 -   distance*planeNormal1;
+		float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);
+		float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);
+		float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;
+		pOnB1.w = distance;
+		// AppendInc is defined elsewhere; presumably it increments the counter and yields the previous value in dstIdx
+		int dstIdx;
+    AppendInc( nGlobalContactsOut, dstIdx );
+		// a contact whose slot index is not below numPairs is dropped
+		if (dstIdx < numPairs)
+		{
+			__global Contact4* c = &globalContactsOut[dstIdx];
+			c->m_worldNormal = normalOnSurfaceB1;
+			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+			c->m_batchIdx = pairIndex;
+			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+			c->m_worldPos[0] = pOnB1;
+			GET_NPOINTS(*c) = 1;
+		}//if (dstIdx < numPairs)
+	}//if (hasCollision)
+}
+
+// One work-item per pair: sphere/plane and sphere/sphere pairs are handled inline, other pairs are clipped hull-vs-hull into at most 4 contacts.
+__kernel void   clipHullHullKernel( __global const int2* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global const float4* separatingNormals,
+																					__global const int* hasSeparatingAxis,
+																					__global Contact4* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numPairs)
+{
+	// note: uniqueEdges is not referenced in this kernel body
+	int i = get_global_id(0);
+	int pairIndex = i;
+	// per-work-item scratch buffers handed to clipHullAgainstHull
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	// depth limits handed to the clipper
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+
+	if (i<numPairs)
+	{
+
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+			
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		// sphere (A) vs plane (B): the helper is called with the two bodies swapped so that the plane comes first
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)
+		{
+
+
+			computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, 
+																rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,numPairs);
+			return;
+		}
+		// plane (A) vs sphere (B)
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&
+			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+
+
+			computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, 
+																rigidBodies,collidables,faces, globalContactsOut, nGlobalContactsOut,numPairs);
+			return;
+			
+		}
+	
+		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&
+			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)
+		{
+			//sphere-sphere
+			float radiusA = collidables[collidableIndexA].m_radius;
+			float radiusB = collidables[collidableIndexB].m_radius;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+
+			float4 diff = posA-posB;
+			float len = length(diff);
+			
+			///iff distance positive, don't generate a new contact
+			if ( len <= (radiusA+radiusB))
+			{
+				///distance (negative means penetration)
+				float dist = len - (radiusA+radiusB);
+				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);
+				if (len > 0.00001)
+				{
+					normalOnSurfaceB = diff / len;
+				}
+				float4 contactPosB = posB + normalOnSurfaceB*radiusB;
+				contactPosB.w = dist;
+				// AppendInc is defined elsewhere; presumably it increments the counter and yields the previous value in dstIdx
+				int dstIdx;
+        AppendInc( nGlobalContactsOut, dstIdx );
+				// NOTE(review): this branch stores -normalOnSurfaceB whereas sphereSphereCollisionKernel stores +normalOnSurfaceB - confirm the intended sign
+				if (dstIdx < numPairs)
+				{
+					__global Contact4* c = &globalContactsOut[dstIdx];
+					c->m_worldNormal = -normalOnSurfaceB;
+					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					c->m_worldPos[0] = contactPosB;
+					GET_NPOINTS(*c) = 1;
+				}//if (dstIdx < numPairs)
+			}//if ( len <= (radiusA+radiusB))
+
+			return;
+		}//SHAPE_SPHERE SHAPE_SPHERE
+		// every remaining pair is clipped only when hasSeparatingAxis[i] is non-zero
+		if (hasSeparatingAxis[i])
+		{
+
+			
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			
+
+
+		
+			int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],
+														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+			// reduce the clipped points to at most 4 and append one Contact4; the stored normal is the negated separating normal
+		if (numLocalContactsOut>0)
+		{
+				float4 normal = -separatingNormals[i];
+				int nPoints = numLocalContactsOut;
+				float4* pointsIn = localContactsOut;
+				int contactIdx[4];// = {-1,-1,-1,-1};
+
+				contactIdx[0] = -1;
+				contactIdx[1] = -1;
+				contactIdx[2] = -1;
+				contactIdx[3] = -1;
+		
+				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+				// NOTE(review): unlike the sphere branches above, this write is not guarded by dstIdx < numPairs
+				int dstIdx;
+				AppendInc( nGlobalContactsOut, dstIdx );
+				//if ((dstIdx+nReducedContacts) < capacity)
+				{
+					__global Contact4* c = globalContactsOut+ dstIdx;
+					c->m_worldNormal = normal;
+					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					// note: the loop variable i below shadows the pair index i
+					for (int i=0;i<nReducedContacts;i++)
+					{
+						c->m_worldPos[i] = pointsIn[contactIdx[i]];
+					}
+					GET_NPOINTS(*c) = nReducedContacts;
+				}
+				
+			}//		if (numContactsOut>0)
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+
+}
+
+// One work-item per compound pair: resolves optional child-shape transforms, clips hull B against hull A and appends up to 4 contacts.
+__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global const btGpuChildShape* gpuChildShapes,
+																					__global const float4* gpuCompoundSepNormalsOut,
+																					__global const int* gpuHasCompoundSepNormalsOut,
+																					__global Contact4* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numCompoundPairs)
+{
+	// note: uniqueEdges is not referenced in this kernel body
+	int i = get_global_id(0);
+	int pairIndex = i;
+	// per-work-item scratch buffers handed to clipHullAgainstHull
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	// depth limits handed to the clipper
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+
+	if (i<numCompoundPairs)
+	{
+
+		if (gpuHasCompoundSepNormalsOut[i])
+		{
+			// pair layout: x,y = body indices; z,w = child-shape indices (a negative value means the body's own collidable is used)
+			int bodyIndexA = gpuCompoundPairs[i].x;
+			int bodyIndexB = gpuCompoundPairs[i].y;
+			
+			int childShapeIndexA = gpuCompoundPairs[i].z;
+			int childShapeIndexB = gpuCompoundPairs[i].w;
+			
+			int collidableIndexA = -1;
+			int collidableIndexB = -1;
+			
+			float4 ornA = rigidBodies[bodyIndexA].m_quat;
+			float4 posA = rigidBodies[bodyIndexA].m_pos;
+			
+			float4 ornB = rigidBodies[bodyIndexB].m_quat;
+			float4 posB = rigidBodies[bodyIndexB].m_pos;
+			// for a child shape, the body transform is combined with the child's position/orientation
+			if (childShapeIndexA >= 0)
+			{
+				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;
+				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;
+				float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;
+				float4 newPosA = qtRotate(ornA,childPosA)+posA;
+				float4 newOrnA = qtMul(ornA,childOrnA);
+				posA = newPosA;
+				ornA = newOrnA;
+			} else
+			{
+				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+			}
+			
+			if (childShapeIndexB>=0)
+			{
+				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;
+				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;
+				float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;
+				float4 newPosB = transform(&childPosB,&posB,&ornB);
+				float4 newOrnB = qtMul(ornB,childOrnB);
+				posB = newPosB;
+				ornB = newOrnB;
+			} else
+			{
+				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	
+			}
+			
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+
+			
+
+
+		
+			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],
+														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+														posA,ornA,
+													  posB,ornB,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+			// reduce the clipped points to at most 4 and append one Contact4; the stored normal is the negated separating normal
+		if (numLocalContactsOut>0)
+		{
+				float4 normal = -gpuCompoundSepNormalsOut[i];
+				int nPoints = numLocalContactsOut;
+				float4* pointsIn = localContactsOut;
+				int contactIdx[4];// = {-1,-1,-1,-1};
+
+				contactIdx[0] = -1;
+				contactIdx[1] = -1;
+				contactIdx[2] = -1;
+				contactIdx[3] = -1;
+		
+				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+				// NOTE(review): the destination slot is not checked against any capacity before the write below
+				int dstIdx;
+				AppendInc( nGlobalContactsOut, dstIdx );
+				//if ((dstIdx+nReducedContacts) < capacity)
+				{
+					__global Contact4* c = globalContactsOut+ dstIdx;
+					c->m_worldNormal = normal;
+					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+					c->m_batchIdx = pairIndex;
+					int bodyA = gpuCompoundPairs[pairIndex].x;
+					int bodyB = gpuCompoundPairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+					// note: the loop variable i below shadows the pair index i
+					for (int i=0;i<nReducedContacts;i++)
+					{
+						c->m_worldPos[i] = pointsIn[contactIdx[i]];
+					}
+					GET_NPOINTS(*c) = nReducedContacts;
+				}
+				
+			}//		if (numContactsOut>0)
+		}//		if (gpuHasCompoundSepNormalsOut[i])
+	}//	if (i<numCompoundPairs)
+
+}
+
+
+// Sphere-vs-sphere contact generation: one work-item per pair, at most one Contact4 appended per pair.
+__kernel void   sphereSphereCollisionKernel( __global const int2* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const float4* separatingNormals,
+																					__global const int* hasSeparatingAxis,
+																					__global Contact4* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numPairs)
+{
+	const int pairIndex = get_global_id(0);
+
+	// work-items past the end of the pair list have nothing to do
+	if (pairIndex >= numPairs)
+		return;
+
+	const int bodyIndexA = pairs[pairIndex].x;
+	const int bodyIndexB = pairs[pairIndex].y;
+
+	const int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+	const int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+
+	// only pairs in which both shapes are spheres are handled here
+	if (collidables[collidableIndexA].m_shapeType != SHAPE_SPHERE ||
+		collidables[collidableIndexB].m_shapeType != SHAPE_SPHERE)
+		return;
+
+	const float radiusA = collidables[collidableIndexA].m_radius;
+	const float radiusB = collidables[collidableIndexB].m_radius;
+	const float radiusSum = radiusA+radiusB;
+	const float4 centerA = rigidBodies[bodyIndexA].m_pos;
+	const float4 centerB = rigidBodies[bodyIndexB].m_pos;
+
+	const float4 centerDelta = centerA-centerB;
+	const float centerDistance = length(centerDelta);
+
+	// no contact unless centerDistance <= radiusSum (written as a negation so a NaN distance also exits)
+	if (!(centerDistance <= radiusSum))
+		return;
+
+	// signed distance between the surfaces (negative means penetration)
+	const float penetration = centerDistance - radiusSum;
+
+	// fall back to the x axis when the centers (nearly) coincide
+	float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);
+	if (centerDistance > 0.00001)
+		normalOnSurfaceB = centerDelta / centerDistance;
+
+	// contact point on sphere B's surface; w carries the signed distance
+	float4 contactPosB = centerB + normalOnSurfaceB*radiusB;
+	contactPosB.w = penetration;
+
+	int dstIdx;
+	AppendInc( nGlobalContactsOut, dstIdx );
+
+	// a contact whose slot index is not below numPairs is dropped
+	if (dstIdx >= numPairs)
+		return;
+	__global Contact4* contact = &globalContactsOut[dstIdx];
+	contact->m_worldNormal = normalOnSurfaceB;
+	contact->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+	contact->m_batchIdx = pairIndex;
+	contact->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;
+	contact->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;
+	contact->m_worldPos[0] = contactPosB;
+	GET_NPOINTS(*contact) = 1;
+}				
+
+__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global const ConvexPolyhedronCL* convexShapes, 
+																					__global const float4* vertices,
+																					__global const float4* uniqueEdges,
+																					__global const btGpuFace* faces,
+																					__global const int* indices,
+																					__global const float4* separatingNormals,
+																					__global Contact4* restrict globalContactsOut,
+																					counter32_t nGlobalContactsOut,
+																					int numConcavePairs)
+{
+
+	int i = get_global_id(0);
+	int pairIndex = i;
+	
+	float4 worldVertsB1[64];
+	float4 worldVertsB2[64];
+	int capacityWorldVerts = 64;	
+
+	float4 localContactsOut[64];
+	int localContactCapacity=64;
+	
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+
+	if (i<numConcavePairs)
+	{
+
+		int bodyIndexA = concavePairsIn[i].x;
+		int bodyIndexB = concavePairsIn[i].y;
+		int f = concavePairsIn[i].z;
+		
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+		
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		
+		///////////////////////////////////////////////////////////////
+		
+	
+		bool overlap = false;
+		
+		ConvexPolyhedronCL convexPolyhedronA;
+
+	//add 3 vertices of the triangle
+		convexPolyhedronA.m_numVertices = 3;
+		convexPolyhedronA.m_vertexOffset = 0;
+		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);
+
+		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];
+		
+		float4 verticesA[3];
+		for (int i=0;i<3;i++)
+		{
+			int index = indices[face.m_indexOffset+i];
+			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];
+			verticesA[i] = vert;
+			localCenter += vert;
+		}
+
+		float dmin = FLT_MAX;
+
+		int localCC=0;
+
+		//a triangle has 3 unique edges
+		convexPolyhedronA.m_numUniqueEdges = 3;
+		convexPolyhedronA.m_uniqueEdgesOffset = 0;
+		float4 uniqueEdgesA[3];
+		
+		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);
+		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);
+		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);
+
+
+		convexPolyhedronA.m_faceOffset = 0;
+                                  
+		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);
+                             
+		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];
+		int indicesA[3+3+2+2+2];
+		int curUsedIndices=0;
+		int fidx=0;
+
+		//front size of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[0] = 0;
+			indicesA[1] = 1;
+			indicesA[2] = 2;
+			curUsedIndices+=3;
+			float c = face.m_plane.w;
+			facesA[fidx].m_plane.x = normal.x;
+			facesA[fidx].m_plane.y = normal.y;
+			facesA[fidx].m_plane.z = normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+		//back size of triangle
+		{
+			facesA[fidx].m_indexOffset=curUsedIndices;
+			indicesA[3]=2;
+			indicesA[4]=1;
+			indicesA[5]=0;
+			curUsedIndices+=3;
+			float c = dot3F4(normal,verticesA[0]);
+			float c1 = -face.m_plane.w;
+			facesA[fidx].m_plane.x = -normal.x;
+			facesA[fidx].m_plane.y = -normal.y;
+			facesA[fidx].m_plane.z = -normal.z;
+			facesA[fidx].m_plane.w = c;
+			facesA[fidx].m_numIndices=3;
+		}
+		fidx++;
+
+		bool addEdgePlanes = true;
+		if (addEdgePlanes)
+		{
+			int numVertices=3;
+			int prevVertex = numVertices-1;
+			for (int i=0;i<numVertices;i++)
+			{
+				float4 v0 = verticesA[i];
+				float4 v1 = verticesA[prevVertex];
+                                            
+				float4 edgeNormal = normalize(cross(normal,v1-v0));
+				float c = -dot3F4(edgeNormal,v0);
+
+				facesA[fidx].m_numIndices = 2;
+				facesA[fidx].m_indexOffset=curUsedIndices;
+				indicesA[curUsedIndices++]=i;
+				indicesA[curUsedIndices++]=prevVertex;
+                                            
+				facesA[fidx].m_plane.x = edgeNormal.x;
+				facesA[fidx].m_plane.y = edgeNormal.y;
+				facesA[fidx].m_plane.z = edgeNormal.z;
+				facesA[fidx].m_plane.w = c;
+				fidx++;
+				prevVertex = i;
+			}
+		}
+		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;
+		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);
+
+
+		float4 posA = rigidBodies[bodyIndexA].m_pos;
+		posA.w = 0.f;
+		float4 posB = rigidBodies[bodyIndexB].m_pos;
+		posB.w = 0.f;
+		float4 c0local = convexPolyhedronA.m_localCenter;
+		float4 ornA = rigidBodies[bodyIndexA].m_quat;
+		float4 c0 = transform(&c0local, &posA, &ornA);
+		float4 c1local = convexShapes[shapeIndexB].m_localCenter;
+		float4 ornB =rigidBodies[bodyIndexB].m_quat;
+		float4 c1 = transform(&c1local,&posB,&ornB);
+		const float4 DeltaC2 = c0 - c1;
+
+		float4 sepAxis = separatingNormals[i];
+		
+		
+		////////////////////////////////////////
+		
+		
+		
+		int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,
+														&convexPolyhedronA, &convexShapes[shapeIndexB],
+														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+													  worldVertsB1,worldVertsB2,capacityWorldVerts,
+														minDist, maxDist,
+														&verticesA,&facesA,&indicesA,
+														vertices,faces,indices,
+														localContactsOut,localContactCapacity);
+												
+		if (numLocalContactsOut>0)
+		{
+			float4 normal = -separatingNormals[i];
+			int nPoints = numLocalContactsOut;
+			float4* pointsIn = localContactsOut;
+			int contactIdx[4];// = {-1,-1,-1,-1};
+
+			contactIdx[0] = -1;
+			contactIdx[1] = -1;
+			contactIdx[2] = -1;
+			contactIdx[3] = -1;
+	
+			int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);
+	
+			int dstIdx;
+			AppendInc( nGlobalContactsOut, dstIdx );
+			//if ((dstIdx+nReducedContacts) < capacity)
+			{
+				__global Contact4* c = globalContactsOut+ dstIdx;
+				c->m_worldNormal = normal;
+				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);
+				c->m_batchIdx = pairIndex;
+				int bodyA = concavePairsIn[pairIndex].x;
+				int bodyB = concavePairsIn[pairIndex].y;
+				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;
+				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;
+
+				for (int i=0;i<nReducedContacts;i++)
+				{
+					c->m_worldPos[i] = pointsIn[contactIdx[i]];
+				}
+				GET_NPOINTS(*c) = nReducedContacts;
+			}
+				
+		}//		if (numContactsOut>0)
+	}//	if (i<numPairs)
+}
+
+
+
+
+
+
+int	findClippingFaces(const float4 separatingNormal, // axis the rotated face normals of both hulls are compared against
+                      __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB,
+                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, // poses applied through qtRotate/transform
+                       __global float4* worldVertsA1, // out: transformed vertices of the selected face of A, from [pairIndex*capacityWorldVerts]
+                      __global float4* worldNormalsA1, // out: rotated normal of the selected face of A, at [pairIndex]
+                      __global float4* worldVertsB1, // out: transformed vertices of the selected face of B, from [pairIndex*capacityWorldVerts]
+                      int capacityWorldVerts, // per-pair stride of worldVertsA1/worldVertsB1; NOTE(review): never compared with the face vertex counts
+                      const float minDist, float maxDist, // not read in this function
+                      __global const float4* vertices, // vertex pool, addressed as m_vertexOffset + indices[...]
+                      __global const btGpuFace* faces, // face pool, addressed as m_faceOffset + face
+                      __global const int* indices, // index pool, addressed as btGpuFace::m_indexOffset + e0
+                      __global int4* clippingFaces, int pairIndex) // out: clippingFaces[pairIndex] = (faceA, faceB, numVertsA, numVertsB)
+{
+	int numContactsOut = 0; // never modified below, so this function always returns 0
+	int numWorldVertsB1= 0;
+	// Step 1: select the face of hull B whose rotated normal has the largest dot product
+	// with separatingNormal. closestFaceB stays -1 when hullB->m_numFaces is 0.
+	int closestFaceB=-1;
+	float dmax = -FLT_MAX;
+    
+	{
+		for(int face=0;face<hullB->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,
+                                              faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);
+			const float4 WorldNormal = qtRotate(ornB, Normal);
+			float d = dot3F4(WorldNormal,separatingNormal);
+			if (d > dmax)
+			{
+				dmax = d;
+				closestFaceB = face;
+			}
+		}
+	}
+	// Step 2: transform the vertices of that face of B and append them to this pair's slice of worldVertsB1.
+	{
+		const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB]; // NOTE(review): no guard for closestFaceB == -1
+		const int numVertices = polyB.m_numIndices;
+		for(int e0=0;e0<numVertices;e0++)
+		{
+			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];
+			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);
+		}
+	}
+	// Step 3: select the face of hull A whose rotated normal has the smallest dot product with separatingNormal.
+    int closestFaceA=-1;
+	{
+		float dmin = FLT_MAX;
+		for(int face=0;face<hullA->m_numFaces;face++)
+		{
+			const float4 Normal = make_float4(
+                                              faces[hullA->m_faceOffset+face].m_plane.x,
+                                              faces[hullA->m_faceOffset+face].m_plane.y,
+                                              faces[hullA->m_faceOffset+face].m_plane.z,
+                                              0.f);
+			const float4 faceANormalWS = qtRotate(ornA,Normal);
+            
+			float d = dot3F4(faceANormalWS,separatingNormal);
+			if (d < dmin)
+			{
+				dmin = d;
+				closestFaceA = face;
+                worldNormalsA1[pairIndex] = faceANormalWS; // rewritten on every improvement, so the last write belongs to closestFaceA
+			}
+		}
+	}
+	// Step 4: transform the vertices of the selected face of A into this pair's slice of worldVertsA1.
+    int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices; // NOTE(review): no guard for closestFaceA == -1
+	for(int e0=0;e0<numVerticesA;e0++)
+	{
+        const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];
+        worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);
+    }
+	// Step 5: publish the hull-relative face indices (x, y) and the two vertex counts (z, w) for this pair.
+    clippingFaces[pairIndex].x = closestFaceA;
+    clippingFaces[pairIndex].y = closestFaceB;
+    clippingFaces[pairIndex].z = numVerticesA;
+    clippingFaces[pairIndex].w = numWorldVertsB1;
+    
+    
+	return numContactsOut;
+}
+
+
+
+int clipFaces(__global float4* worldVertsA1, // in: vertices of the selected face of A, from [pairIndex*capacityWorldVertsB2]
+              __global float4* worldNormalsA1, // in: normal of the selected face of A, at [pairIndex]
+              __global float4* worldVertsB1, // in/out: polygon of B for this pair; also used as a ping-pong buffer
+              __global float4* worldVertsB2, // in/out: second ping-pong buffer with the same per-pair layout
+              int capacityWorldVertsB2, // per-pair stride applied to worldVertsA1, worldVertsB1 and worldVertsB2
+              const float minDist, float maxDist, // depth is raised to at least minDist; points with depth > maxDist are dropped
+              __global int4* clippingFaces, // in: (faceA, faceB, numVertsA, numVertsB); .w is overwritten with the contact count
+              int pairIndex)
+{
+	int numContactsOut = 0;
+	// Clips the stored polygon of B against the edges of the stored face of A and returns the number of kept points.
+    int closestFaceA = clippingFaces[pairIndex].x;
+    int closestFaceB = clippingFaces[pairIndex].y; // read but not used below
+	int numVertsInA = clippingFaces[pairIndex].z;
+	int numVertsInB = clippingFaces[pairIndex].w;
+    
+	int numVertsOut = 0;
+	// NOTE(review): this early exit returns 0 but leaves clippingFaces[pairIndex].w at its incoming value.
+	if (closestFaceA<0)
+		return numContactsOut;
+    
+    __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];
+    __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];
+    
+    
+	
+	// clip the polygon of B against one plane per edge of the A face; each plane passes through the edge start (aw)
+	// and its normal is -cross(edge, face normal), i.e. perpendicular to both the edge and the A face normal
+	for(int e0=0;e0<numVertsInA;e0++)
+	{
+		const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];
+		const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)]; // next vertex, wrapping to 0
+		const float4 WorldEdge0 = aw - bw;
+		float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];
+		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);
+		float4 worldA1 = aw;
+		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);
+		float4 planeNormalWS = planeNormalWS1;
+		float planeEqWS=planeEqWS1;
+		numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);
+		__global float4* tmp = pVtxOut; // swap the buffers so the clipped polygon becomes the next input
+		pVtxOut = pVtxIn;
+		pVtxIn = tmp;
+		numVertsInB = numVertsOut;
+		numVertsOut = 0;
+	}
+    
+    //float4 planeNormalWS = worldNormalsA1[pairIndex];
+    //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);
+
+
+    
+    /*for (int i=0;i<numVertsInB;i++)
+    {
+        pVtxOut[i] = pVtxIn[i];
+    }*/
+    
+    
+    
+    
+    //numVertsInB=0;
+	// plane of the A face itself: its stored normal, passing through the first stored vertex of A
+    float4 planeNormalWS = worldNormalsA1[pairIndex];
+    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);
+
+    for (int i=0;i<numVertsInB;i++)
+    {
+        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; // signed distance of the point to the A face plane
+        if (depth <=minDist)
+        {
+            depth = minDist;
+        }
+        
+        if (depth <=maxDist)
+        {
+            float4 pointInWorld = pVtxIn[i];
+            pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); // depth is stored in w
+        }
+    }
+	// NOTE(review): pVtxOut was swapped once per edge, so which buffer holds the contacts depends on the parity of numVertsInA.
+    clippingFaces[pairIndex].w =numContactsOut;
+   
+    
+	return numContactsOut;
+
+}
+
+
+
+
+__kernel void   findClippingFacesKernel(  __global const int2* pairs, // (bodyIndexA, bodyIndexB) per pair
+                                        __global const BodyData* rigidBodies, // provides m_collidableIdx, m_pos and m_quat per body
+                                        __global const btCollidableGpu* collidables, // provides m_shapeIndex per collidable
+                                        __global const ConvexPolyhedronCL* convexShapes, // hull descriptors, indexed by shape index
+                                        __global const float4* vertices,
+                                        __global const float4* uniqueEdges, // not read in this kernel
+                                        __global const btGpuFace* faces,
+                                        __global const int* indices,
+                                        __global const float4* separatingNormals, // one axis per pair, forwarded to findClippingFaces
+                                        __global const int* hasSeparatingAxis, // per-pair flag; pairs with 0 are skipped
+                                        __global int4* clippingFacesOut, // out: filled per pair by findClippingFaces
+                                        __global float4* worldVertsA1, // out: see findClippingFaces
+                                        __global float4* worldNormalsA1, // out: see findClippingFaces
+                                        __global float4* worldVertsB1, // out: see findClippingFaces
+                                        int capacityWorldVerts, // per-pair stride of the vertex output buffers
+                                        int numPairs
+                                        )
+{
+	// One work-item per pair: resolves both shapes and runs findClippingFaces for flagged pairs.
+	int i = get_global_id(0);
+	int pairIndex = i; // not used below; i is passed as the pair index
+    
+	// minDist/maxDist are forwarded to findClippingFaces, which does not read them
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+    
+	if (i<numPairs)
+	{
+        
+		if (hasSeparatingAxis[i])
+		{
+            
+			int bodyIndexA = pairs[i].x;
+			int bodyIndexB = pairs[i].y;
+			
+			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+			
+			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+			
+            
+			// the return value is always 0 and is not used; the results are the buffer writes
+			int numLocalContactsOut = findClippingFaces(separatingNormals[i],
+                                                        &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],
+                                                        rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,
+                                                        rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,
+                                                        worldVertsA1,
+                                                        worldNormalsA1,
+                                                        worldVertsB1,capacityWorldVerts,
+                                                        minDist, maxDist,
+                                                        vertices,faces,indices,
+                                                        clippingFacesOut,i);
+            
+            
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+    
+}
+
+
+
+
+__kernel void   clipFacesAndContactReductionKernel( __global const int2* pairs, // only read into bodyIndexA/B, which are unused
+                                                   __global const BodyData* rigidBodies, // not read in this kernel
+                                                   __global const float4* separatingNormals, // not read in this kernel
+                                                   __global const int* hasSeparatingAxis, // per-pair flag; pairs with 0 are skipped
+                                                     __global Contact4* globalContactsOut, // not written in this kernel
+                                                   __global int4* clippingFacesOut, // in: (faceA, faceB, numVertsA, numVertsB); .w becomes the contact count
+                                                   __global float4* worldVertsA1, // in: vertices of the selected face of A per pair
+                                                   __global float4* worldNormalsA1, // in: normal of the selected face of A per pair
+                                                   __global float4* worldVertsB1, // in/out: polygon of B per pair, ping-pong buffer
+                                                   __global float4* worldVertsB2, // in/out: second ping-pong buffer
+                                                   counter32_t nGlobalContactsOut, // not used in this kernel
+                                                    int vertexFaceCapacity, // per-pair stride of the three vertex buffers
+                                                   int numPairs,
+					                                        int debugMode // not used in this kernel
+                                                   )
+{
+    int i = get_global_id(0);
+	int pairIndex = i;
+	// One work-item per pair. Despite the name, only clipping and the depth filter run here;
+	// no manifold reduction is performed and no Contact4 is emitted in this kernel.
+	float minDist = -1e30f;
+	float maxDist = 0.02f;
+    
+	if (i<numPairs)
+	{
+        
+		if (hasSeparatingAxis[i])
+		{
+            
+			int bodyIndexA = pairs[i].x;
+			int bodyIndexB = pairs[i].y;
+		    
+            int numLocalContactsOut = 0;
+
+            int capacityWorldVertsB2 = vertexFaceCapacity;
+            
+            __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];
+            __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];
+            
+
+            {
+                __global int4* clippingFaces = clippingFacesOut;
+            
+                
+                int closestFaceA = clippingFaces[pairIndex].x;
+                int closestFaceB = clippingFaces[pairIndex].y; // read but not used below
+                int numVertsInA = clippingFaces[pairIndex].z;
+                int numVertsInB = clippingFaces[pairIndex].w;
+                
+                int numVertsOut = 0;
+                
+                if (closestFaceA>=0)
+                {
+                    
+                    
+                    // clip the polygon of B against one plane per edge of the A face; each plane passes through the edge
+                    // start (aw) and its normal is -cross(edge, face normal)
+                    
+                    for(int e0=0;e0<numVertsInA;e0++)
+                    {
+                        const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];
+                        const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)]; // next vertex, wrapping to 0
+                        const float4 WorldEdge0 = aw - bw;
+                        float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];
+                        float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);
+                        float4 worldA1 = aw;
+                        float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);
+                        float4 planeNormalWS = planeNormalWS1;
+                        float planeEqWS=planeEqWS1;
+                        numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);
+                        __global float4* tmp = pVtxOut; // swap the buffers so the clipped polygon becomes the next input
+                        pVtxOut = pVtxIn;
+                        pVtxIn = tmp;
+                        numVertsInB = numVertsOut;
+                        numVertsOut = 0;
+                    }
+                    // plane of the A face itself: its stored normal, passing through the first stored vertex of A
+                    float4 planeNormalWS = worldNormalsA1[pairIndex];
+                    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);
+                    
+                    for (int i=0;i<numVertsInB;i++) // this i shadows the work-item index
+                    {
+                        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS; // signed distance of the point to the A face plane
+                        if (depth <=minDist)
+                        {
+                            depth = minDist;
+                        }
+                        
+                        if (depth <=maxDist)
+                        {
+                            float4 pointInWorld = pVtxIn[i];
+                            pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth); // depth is stored in w
+                        }
+                    }
+                    
+                }
+                clippingFaces[pairIndex].w =numLocalContactsOut; // 0 when closestFaceA < 0
+                
+
+            }
+            // pVtxOut may be either buffer after the swaps; copying into pVtxIn leaves the contacts at the start of both slices
+            for (int i=0;i<numLocalContactsOut;i++)
+                pVtxIn[i] = pVtxOut[i];
+                
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+    
+}
+
+
+
+
+
+__kernel void   newContactReductionKernel( __global const int2* pairs, // (bodyA, bodyB) per pair
+                                                   __global const BodyData* rigidBodies, // only m_invMass is read here
+                                                   __global const float4* separatingNormals, // negated to form the contact normal
+                                                   __global const int* hasSeparatingAxis, // per-pair flag; pairs with 0 are skipped
+                                                   __global Contact4* globalContactsOut, // out: one Contact4 per pair that has points
+                                                   __global int4* clippingFaces, // in: .w is the number of candidate points of the pair
+                                                   __global float4* worldVertsB2, // in: candidate points, from [pairIndex*vertexFaceCapacity]
+                                                   volatile __global int* nGlobalContactsOut, // in/out: contact counter, incremented through AppendInc
+                                                   int vertexFaceCapacity, // per-pair stride of worldVertsB2
+                                                   int numPairs
+                                                   )
+{
+    int i = get_global_id(0);
+	int pairIndex = i;
+	// One work-item per pair: picks up to four of the pair's candidate points and emits one Contact4.
+    int4 contactIdx;
+    contactIdx=make_int4(0,1,2,3);
+	// contactIdx is passed by address to extractManifoldSequentialGlobal, which is defined outside this chunk
+	if (i<numPairs)
+	{
+        
+		if (hasSeparatingAxis[i])
+		{
+            
+			
+            
+            
+			int nPoints = clippingFaces[pairIndex].w;
+           
+            if (nPoints>0)
+            {
+
+                 __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];
+                float4 normal = -separatingNormals[i];
+                
+                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);
+            
+                int dstIdx;
+                AppendInc( nGlobalContactsOut, dstIdx ); // atomically reserves an output slot; dstIdx is the counter value before the increment
+				
+//#if 0
+                // NOTE(review): the counter was already incremented even if the test below rejects the slot; numPairs is used as the bound
+				if (dstIdx < numPairs)
+				{
+					__global Contact4* c = &globalContactsOut[dstIdx];
+					c->m_worldNormal = normal;
+					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16); // low 16 bits: 0, high 16 bits: 0.7 scaled by 0xffff
+					c->m_batchIdx = pairIndex;
+					int bodyA = pairs[pairIndex].x;
+					int bodyB = pairs[pairIndex].y;
+					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA; // bodies with m_invMass == 0 are stored as a negated index
+					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB; // NOTE(review): index 0 cannot carry that sign
+                    // intentional fall-through: case k copies point k-1 and continues into the lower cases
+                    switch (nReducedContacts)
+                    {
+                        case 4:
+                            c->m_worldPos[3] = pointsIn[contactIdx.w];
+                        case 3:
+                            c->m_worldPos[2] = pointsIn[contactIdx.z];
+                        case 2:
+                            c->m_worldPos[1] = pointsIn[contactIdx.y];
+                        case 1:
+                            c->m_worldPos[0] = pointsIn[contactIdx.x];
+                        default:
+                        {
+                        }
+                    };
+                    // stores the point count through the GET_NPOINTS accessor
+					GET_NPOINTS(*c) = nReducedContacts;
+                    
+                 }
+                 
+                
+//#endif
+				
+			}//		if (nPoints>0)
+		}//		if (hasSeparatingAxis[i])
+	}//	if (i<numPairs)
+
+    
+    
+}
diff --git a/opencl/gpu_sat/kernels/satClipHullContacts.h b/opencl/gpu_sat/kernels/satClipHullContacts.h
new file mode 100644
index 000000000..099001db3
--- /dev/null
+++ b/opencl/gpu_sat/kernels/satClipHullContacts.h
@@ -0,0 +1,2043 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satClipKernelsCL= \
+"\n"
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"\n"
+"#define SHAPE_PLANE 4\n"
+"#define SHAPE_SPHERE 7\n"
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"#define counter32_t volatile __global int*\n"
+"#endif\n"
+"\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"\n"
+"typedef unsigned int u32;\n"
+"\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_worldNormal;	//	w: m_nPoints\n"
+"//	float m_restituitionCoeff;\n"
+"//	float m_frictionCoeff;\n"
+"	u32 m_coeffs;\n"
+"	u32 m_batchIdx;\n"
+"//	int m_nPoints;\n"
+"//	int m_padding0;\n"
+"\n"
+"	int m_bodyAPtrAndSignBit;//x:m_bodyAPtr, y:m_bodyBPtr\n"
+"	int m_bodyBPtrAndSignBit;\n"
+"} Contact4;\n"
+"\n"
+"\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"{\n"
+"	int m_numChildShapes;\n"
+"	float m_radius;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"\n"
+"#define GET_NPOINTS(x) (x).m_worldNormal.w\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"\n"
+"	u32 m_collidableIdx;	\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"\n"
+"\n"
+"typedef struct  \n"
+"{\n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"	\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"\n"
+"} ConvexPolyhedronCL;\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"\n"
+"__inline\n"
+"float fastDiv(float numerator, float denominator)\n"
+"{\n"
+"	return native_divide(numerator, denominator);	\n"
+"//	return numerator/denominator;	\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 fastDiv4(float4 numerator, float4 denominator)\n"
+"{\n"
+"	return native_divide(numerator, denominator);	\n"
+"}\n"
+"\n"
+"\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	return cross(a,b);\n"
+"}\n"
+"\n"
+"//#define dot3F4 dot\n"
+"\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 fastNormalize4(float4 v)\n"
+"{\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"\n"
+"\n"
+"///////////////////////////////////////\n"
+"//	Quaternion\n"
+"///////////////////////////////////////\n"
+"\n"
+"typedef float4 Quaternion;\n"
+"\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"{\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"}\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"__inline\n"
+"float4 normalize3(const float4 a)\n"
+"{\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"}\n"
+"\n"
+"\n"
+"__inline float4 lerp3(const float4 a,const float4 b, float  t)\n"
+"{\n"
+"	return make_float4(	a.x + (b.x - a.x) * t,\n"
+"						a.y + (b.y - a.y) * t,\n"
+"						a.z + (b.z - a.z) * t,\n"
+"						0.f);\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n"
+"int clipFaceGlobal(__global const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, __global float4* ppVtxOut)\n"
+"{\n"
+"	\n"
+"	int ve;\n"
+"	float ds, de;\n"
+"	int numVertsOut = 0;\n"
+"    //double-check next test\n"
+"    //	if (numVertsIn < 2)\n"
+"    //		return 0;\n"
+"    \n"
+"	float4 firstVertex=pVtxIn[numVertsIn-1];\n"
+"	float4 endVertex = pVtxIn[0];\n"
+"	\n"
+"	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n"
+"    \n"
+"	for (ve = 0; ve < numVertsIn; ve++)\n"
+"	{\n"
+"		endVertex=pVtxIn[ve];\n"
+"		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n"
+"		if (ds<0)\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start < 0, end < 0, so output endVertex\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// Start < 0, end >= 0, so output intersection\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"			}\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start >= 0, end < 0 so output intersection and end\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"		}\n"
+"		firstVertex = endVertex;\n"
+"		ds = de;\n"
+"	}\n"
+"	return numVertsOut;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// Clips a face to the back of a plane, return the number of vertices out, stored in ppVtxOut\n"
+"int clipFace(const float4* pVtxIn, int numVertsIn, float4 planeNormalWS,float planeEqWS, float4* ppVtxOut)\n"
+"{\n"
+"	\n"
+"	int ve;\n"
+"	float ds, de;\n"
+"	int numVertsOut = 0;\n"
+"//double-check next test\n"
+"//	if (numVertsIn < 2)\n"
+"//		return 0;\n"
+"\n"
+"	float4 firstVertex=pVtxIn[numVertsIn-1];\n"
+"	float4 endVertex = pVtxIn[0];\n"
+"	\n"
+"	ds = dot3F4(planeNormalWS,firstVertex)+planeEqWS;\n"
+"\n"
+"	for (ve = 0; ve < numVertsIn; ve++)\n"
+"	{\n"
+"		endVertex=pVtxIn[ve];\n"
+"\n"
+"		de = dot3F4(planeNormalWS,endVertex)+planeEqWS;\n"
+"\n"
+"		if (ds<0)\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start < 0, end < 0, so output endVertex\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// Start < 0, end >= 0, so output intersection\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"			}\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			if (de<0)\n"
+"			{\n"
+"				// Start >= 0, end < 0 so output intersection and end\n"
+"				ppVtxOut[numVertsOut++] = lerp3(firstVertex, endVertex,(ds * 1.f/(ds - de)) );\n"
+"				ppVtxOut[numVertsOut++] = endVertex;\n"
+"			}\n"
+"		}\n"
+"		firstVertex = endVertex;\n"
+"		ds = de;\n"
+"	}\n"
+"	return numVertsOut;\n"
+"}\n"
+"\n"
+"\n"
+"int clipFaceAgainstHull(const float4 separatingNormal, __global const ConvexPolyhedronCL* hullA,  \n"
+"	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n"
+"	float4* worldVertsB2, int capacityWorldVertsB2,\n"
+"	const float minDist, float maxDist,\n"
+"	__global const float4* vertices,\n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int* indices,\n"
+"	float4* contactsOut,\n"
+"	int contactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"\n"
+"	float4* pVtxIn = worldVertsB1;\n"
+"	float4* pVtxOut = worldVertsB2;\n"
+"	\n"
+"	int numVertsIn = numWorldVertsB1;\n"
+"	int numVertsOut = 0;\n"
+"\n"
+"	int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"				faces[hullA->m_faceOffset+face].m_plane.x, \n"
+"				faces[hullA->m_faceOffset+face].m_plane.y, \n"
+"				faces[hullA->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"		\n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"\n"
+"	btGpuFace polyA = faces[hullA->m_faceOffset+closestFaceA];\n"
+"\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"	int numVerticesA = polyA.m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"		const float4 a = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+e0]];\n"
+"		const float4 b = vertices[hullA->m_vertexOffset+indices[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n"
+"		const float4 edge0 = a - b;\n"
+"		const float4 WorldEdge0 = qtRotate(ornA,edge0);\n"
+"		float4 planeNormalA = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float4 worldPlaneAnormal1 = qtRotate(ornA,planeNormalA);\n"
+"\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"		float4 worldA1 = transform(&a,&posA,&ornA);\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		\n"
+"		//clip face\n"
+"		//clipFace(*pVtxIn, *pVtxOut,planeNormalWS,planeEqWS);\n"
+"		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
+"\n"
+"		//btSwap(pVtxIn,pVtxOut);\n"
+"		float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsIn = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"\n"
+"	\n"
+"	// only keep points that are behind the witness face\n"
+"	{\n"
+"		float4 localPlaneNormal  = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"		float localPlaneEq = polyA.m_plane.w;\n"
+"		float4 planeNormalWS = qtRotate(ornA,localPlaneNormal);\n"
+"		float planeEqWS=localPlaneEq-dot3F4(planeNormalWS,posA);\n"
+"		for (int i=0;i<numVertsIn;i++)\n"
+"		{\n"
+"			float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"			if (depth <=minDist)\n"
+"			{\n"
+"				depth = minDist;\n"
+"			}\n"
+"\n"
+"			if (depth <=maxDist)\n"
+"			{\n"
+"				float4 pointInWorld = pVtxIn[i];\n"
+"				//resultOut.addContactPoint(separatingNormal,point,depth);\n"
+"				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	return numContactsOut;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// Clips a world-space polygon taken from hull B against the witness face of hull A and\n"
+"// collects the surviving vertices as contact points. Hull A is read through pointers that\n"
+"// carry no __global qualifier (verticesA/facesA/indicesA); the B arrays are accepted but unused.\n"
+"//\n"
+"// separatingNormal      axis used to choose the witness face of A: the face whose rotated\n"
+"//                       normal has the smallest dot product with this vector.\n"
+"// hullA, posA, ornA     shape description and world transform of hull A.\n"
+"// worldVertsB1          input polygon (numWorldVertsB1 vertices); also used as scratch.\n"
+"// worldVertsB2          second scratch buffer; both buffers are swapped after every clip.\n"
+"// capacityWorldVertsB2  not used here; clipFace receives no capacity argument.\n"
+"// minDist, maxDist      depths below minDist are clamped to minDist; points whose depth is\n"
+"//                       greater than maxDist are dropped.\n"
+"// contactsOut           receives (x,y,z,depth) per contact, at most contactCapacity entries.\n"
+"// Returns the number of contacts written to contactsOut.\n"
+"int clipFaceAgainstHullLocalA(const float4 separatingNormal, const ConvexPolyhedronCL* hullA,  \n"
+"	const float4 posA, const Quaternion ornA, float4* worldVertsB1, int numWorldVertsB1,\n"
+"	float4* worldVertsB2, int capacityWorldVertsB2,\n"
+"	const float minDist, float maxDist,\n"
+"	const float4* verticesA,\n"
+"	const btGpuFace* facesA,\n"
+"	const int* indicesA,\n"
+"	__global const float4* verticesB,\n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int* indicesB,\n"
+"	float4* contactsOut,\n"
+"	int contactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"\n"
+"	float4* pVtxIn = worldVertsB1;\n"
+"	float4* pVtxOut = worldVertsB2;\n"
+"	\n"
+"	int numVertsIn = numWorldVertsB1;\n"
+"	int numVertsOut = 0;\n"
+"\n"
+"	// pick the witness face of A: smallest dot(normal, separatingNormal)\n"
+"	int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"				facesA[hullA->m_faceOffset+face].m_plane.x, \n"
+"				facesA[hullA->m_faceOffset+face].m_plane.y, \n"
+"				facesA[hullA->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"		\n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"\n"
+"	btGpuFace polyA = facesA[hullA->m_faceOffset+closestFaceA];\n"
+"\n"
+"	// The witness normal does not change per edge, so rotate it into world space once.\n"
+"	const float4 localPlaneNormal = make_float4(polyA.m_plane.x,polyA.m_plane.y,polyA.m_plane.z,0.f);\n"
+"	const float4 worldPlaneAnormal = qtRotate(ornA,localPlaneNormal);\n"
+"\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"	int numVerticesA = polyA.m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"		const float4 a = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+e0]];\n"
+"		const float4 b = verticesA[hullA->m_vertexOffset+indicesA[polyA.m_indexOffset+((e0+1)%numVerticesA)]];\n"
+"		const float4 edge0 = a - b;\n"
+"		const float4 WorldEdge0 = qtRotate(ornA,edge0);\n"
+"\n"
+"		// side plane through edge (a,b), perpendicular to the witness face\n"
+"		float4 planeNormalWS = -cross3(WorldEdge0,worldPlaneAnormal);\n"
+"		float4 worldA1 = transform(&a,&posA,&ornA);\n"
+"		float planeEqWS = -dot3F4(worldA1,planeNormalWS);\n"
+"		\n"
+"		//clip face\n"
+"		numVertsOut = clipFace(pVtxIn, numVertsIn, planeNormalWS,planeEqWS, pVtxOut);\n"
+"\n"
+"		// the clipped output becomes the input of the next edge plane\n"
+"		float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsIn = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"\n"
+"	\n"
+"	// only keep points that are behind the witness face\n"
+"	{\n"
+"		float localPlaneEq = polyA.m_plane.w;\n"
+"		float planeEqWS=localPlaneEq-dot3F4(worldPlaneAnormal,posA);\n"
+"		// stop as soon as contactsOut is full instead of writing past contactCapacity\n"
+"		for (int i=0;i<numVertsIn && numContactsOut<contactCapacity;i++)\n"
+"		{\n"
+"			float depth = dot3F4(worldPlaneAnormal,pVtxIn[i])+planeEqWS;\n"
+"			if (depth <=minDist)\n"
+"			{\n"
+"				depth = minDist;\n"
+"			}\n"
+"\n"
+"			if (depth <=maxDist)\n"
+"			{\n"
+"				float4 pointInWorld = pVtxIn[i];\n"
+"				contactsOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	return numContactsOut;\n"
+"}\n"
+"\n"
+"// Finds the face of hull B whose world normal has the largest dot product with\n"
+"// separatingNormal, transforms its vertices into world space (worldVertsB1) and hands the\n"
+"// polygon to clipFaceAgainstHull to be clipped against hull A.\n"
+"//\n"
+"// worldVertsB1/worldVertsB2  caller-provided scratch buffers of capacityWorldVerts entries.\n"
+"// vertices/faces/indices     shared __global arrays indexed through the offsets in hullA/hullB.\n"
+"// localContactsOut           receives the contacts; localContactCapacity is forwarded.\n"
+"// Returns the number of contacts produced, 0 when hull B has no face to clip.\n"
+"int	clipHullAgainstHull(const float4 separatingNormal,\n"
+"	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n"
+"	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
+"	const float minDist, float maxDist,\n"
+"	__global const float4* vertices,\n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int* indices,\n"
+"	float4*	localContactsOut,\n"
+"	int localContactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"\n"
+"\n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"\n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x, \n"
+"				faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	// No face selected (e.g. m_numFaces == 0): bail out before faces[] is indexed with -1.\n"
+"	if (closestFaceB<0)\n"
+"		return numContactsOut;\n"
+"\n"
+"	{\n"
+"		const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
+"		int numVertices = polyB.m_numIndices;\n"
+"		// never write more vertices than the scratch buffer can hold\n"
+"		if (numVertices > capacityWorldVerts)\n"
+"			numVertices = capacityWorldVerts;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"\n"
+"	numContactsOut = clipFaceAgainstHull(separatingNormal, hullA, \n"
+"			posA,ornA,\n"
+"			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,vertices,\n"
+"			faces,\n"
+"			indices,localContactsOut,localContactCapacity);\n"
+"\n"
+"	return numContactsOut;\n"
+"}\n"
+"\n"
+"\n"
+"// Variant of clipHullAgainstHull in which hull A (hullA, verticesA, facesA, indicesA) is\n"
+"// passed through pointers without a __global qualifier, while hull B stays in __global memory.\n"
+"// Selects the face of B whose world normal has the largest dot product with separatingNormal,\n"
+"// writes its world-space vertices to worldVertsB1 and clips it with clipFaceAgainstHullLocalA.\n"
+"//\n"
+"// worldVertsB1/worldVertsB2  caller-provided scratch buffers of capacityWorldVerts entries.\n"
+"// localContactsOut           receives the contacts; localContactCapacity is forwarded.\n"
+"// Returns the number of contacts produced, 0 when hull B has no face to clip.\n"
+"int	clipHullAgainstHullLocalA(const float4 separatingNormal,\n"
+"	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB, \n"
+"	float4* worldVertsB1, float4* worldVertsB2, int capacityWorldVerts,\n"
+"	const float minDist, float maxDist,\n"
+"	const float4* verticesA,\n"
+"	const btGpuFace* facesA,\n"
+"	const int* indicesA,\n"
+"	__global const float4* verticesB,\n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int* indicesB,\n"
+"	float4*	localContactsOut,\n"
+"	int localContactCapacity)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"\n"
+"\n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"\n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(facesB[hullB->m_faceOffset+face].m_plane.x, \n"
+"				facesB[hullB->m_faceOffset+face].m_plane.y, facesB[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	// No face selected (e.g. m_numFaces == 0): bail out before facesB[] is indexed with -1.\n"
+"	if (closestFaceB<0)\n"
+"		return numContactsOut;\n"
+"\n"
+"	{\n"
+"		const btGpuFace polyB = facesB[hullB->m_faceOffset+closestFaceB];\n"
+"		int numVertices = polyB.m_numIndices;\n"
+"		// never write more vertices than the scratch buffer can hold\n"
+"		if (numVertices > capacityWorldVerts)\n"
+"			numVertices = capacityWorldVerts;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"			const float4 b = verticesB[hullB->m_vertexOffset+indicesB[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"\n"
+"	numContactsOut = clipFaceAgainstHullLocalA(separatingNormal, hullA, \n"
+"			posA,ornA,\n"
+"			worldVertsB1,numWorldVertsB1,worldVertsB2,capacityWorldVerts, minDist, maxDist,\n"
+"			verticesA,facesA,indicesA,\n"
+"			verticesB,facesB,indicesB,\n"
+"			localContactsOut,localContactCapacity);\n"
+"\n"
+"	return numContactsOut;\n"
+"}\n"
+"\n"
+"// Sequential helper macros: despite their names each one expands to a plain for loop.\n"
+"// PARALLEL_SUM(v, n)   adds v[1] .. v[n-1] into v[0].\n"
+"// PARALLEL_DO(exec, n) runs exec for ie = 0 .. n-1; the loop index ie is visible inside exec.\n"
+"// REDUCE_MAX(v, n)     leaves in v[0] the element of v[0..n-1] with the largest .y member.\n"
+"// REDUCE_MIN(v, n)     leaves in v[0] the element of v[0..n-1] with the smallest .y member.\n"
+"// The REDUCE_* definitions span two adjacent string literals with no newline between them,\n"
+"// so each #define still forms a single line of kernel source.\n"
+"#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];\n"
+"#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}\n"
+"#define REDUCE_MAX(v, n) {int i=0;"
+"for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }\n"
+"#define REDUCE_MIN(v, n) {int i=0;"
+"for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }\n"
+"\n"
+"// Chooses up to four representative contacts out of nPoints candidates stored in __global\n"
+"// memory. Each p[i] holds a position in xyz and a depth in w.\n"
+"//\n"
+"// p           candidate points; only the first 64 are considered.\n"
+"// nPoints     number of candidates.\n"
+"// nearNormal  contact normal; together with (p[0] - centroid) it spans the two in-plane\n"
+"//             sampling axes u and v.\n"
+"// contactIdx  contactIdx[0].xyzw receive the chosen indices. It is NOT written when\n"
+"//             nPoints <= 4, so the caller has to provide the indices for that case.\n"
+"// Returns 0 for no points, nPoints when nPoints <= 4, otherwise 4.\n"
+"//\n"
+"// For every axis the point with the SMALLEST projection is kept, i.e. .x is the extreme\n"
+"// point along -u, .y along +u, .z along -v and .w along +v.\n"
+"int extractManifoldSequentialGlobal(__global const float4* p, int nPoints, float4 nearNormal, int4* contactIdx)\n"
+"{\n"
+"	if( nPoints == 0 )\n"
+"        return 0;\n"
+"    \n"
+"    if (nPoints <=4)\n"
+"        return nPoints;\n"
+"    \n"
+"    \n"
+"    if (nPoints >64)\n"
+"        nPoints = 64;\n"
+"    \n"
+"	float4 center = make_float4(0.f);\n"
+"	{\n"
+"		\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"			center += p[i];\n"
+"		center /= (float)nPoints;\n"
+"	}\n"
+"    \n"
+"	//	sample 4 directions\n"
+"    \n"
+"    float4 aVector = p[0] - center;\n"
+"    float4 u = cross3( nearNormal, aVector );\n"
+"    float4 v = cross3( nearNormal, u );\n"
+"    u = normalize3( u );\n"
+"    v = normalize3( v );\n"
+"    \n"
+"    \n"
+"    //keep point with deepest penetration\n"
+"    float minW= FLT_MAX;\n"
+"    \n"
+"    int minIndex=-1;\n"
+"    \n"
+"    // Running minima of the four projections. The sentinel has to be FLT_MAX: FLT_MIN is the\n"
+"    // smallest POSITIVE float, so with it an index stayed unwritten whenever every projection\n"
+"    // on an axis was positive (possible through rounding of the centroid).\n"
+"    float4 minDots;\n"
+"    minDots.x = FLT_MAX;\n"
+"    minDots.y = FLT_MAX;\n"
+"    minDots.z = FLT_MAX;\n"
+"    minDots.w = FLT_MAX;\n"
+"    \n"
+"    //	idx, distance\n"
+"    for(int ie = 0; ie<nPoints; ie++ )\n"
+"    {\n"
+"        if (p[ie].w<minW)\n"
+"        {\n"
+"            minW = p[ie].w;\n"
+"            minIndex=ie;\n"
+"        }\n"
+"        float f;\n"
+"        float4 r = p[ie]-center;\n"
+"        f = dot3F4( u, r );\n"
+"        if (f<minDots.x)\n"
+"        {\n"
+"            minDots.x = f;\n"
+"            contactIdx[0].x = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -u, r );\n"
+"        if (f<minDots.y)\n"
+"        {\n"
+"            minDots.y = f;\n"
+"            contactIdx[0].y = ie;\n"
+"        }\n"
+"        \n"
+"        \n"
+"        f = dot3F4( v, r );\n"
+"        if (f<minDots.z)\n"
+"        {\n"
+"            minDots.z = f;\n"
+"            contactIdx[0].z = ie;\n"
+"        }\n"
+"        \n"
+"        f = dot3F4( -v, r );\n"
+"        if (f<minDots.w)\n"
+"        {\n"
+"            minDots.w = f;\n"
+"            contactIdx[0].w = ie;\n"
+"        }\n"
+"        \n"
+"    }\n"
+"    \n"
+"    // Make sure the deepest point is part of the result. minIndex stays -1 only when no depth\n"
+"    // compared below FLT_MAX; never store that as a contact index.\n"
+"    if (minIndex >= 0 && contactIdx[0].x != minIndex && contactIdx[0].y != minIndex && contactIdx[0].z != minIndex && contactIdx[0].w != minIndex)\n"
+"    {\n"
+"        //replace the first contact with minimum (todo: replace contact with least penetration)\n"
+"        contactIdx[0].x = minIndex;\n"
+"    }\n"
+"    \n"
+"    return 4;\n"
+"    \n"
+"}\n"
+"\n"
+"\n"
+"// Placeholder reduction: always selects the first four points without inspecting them.\n"
+"// contactIdx[0..3] is set to 0,1,2,3 regardless of nPoints; p and nearNormal are ignored.\n"
+"// Returns 0 for an empty set, otherwise min2(nPoints, 4).\n"
+"int extractManifoldSequentialGlobalFake(__global const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n"
+"{\n"
+"	for (int slot=0; slot<4; slot++)\n"
+"	{\n"
+"		contactIdx[slot] = slot;\n"
+"	}\n"
+"\n"
+"	if( nPoints == 0 )\n"
+"	{\n"
+"		return 0;\n"
+"	}\n"
+"\n"
+"	return min2( nPoints, 4 );\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// Reduces up to 64 contact candidates to at most four by keeping, for each of the directions\n"
+"// +u, -u, +v, -v in the contact plane, the point with the largest projection.\n"
+"//\n"
+"// p           candidate points (xyz position, w depth); entries beyond the first 64 are ignored.\n"
+"// nPoints     number of candidates.\n"
+"// nearNormal  contact normal; u = nearNormal x (p[0] - centroid), v = nearNormal x u.\n"
+"// contactIdx  receives the chosen indices: the identity 0..nPoints-1 when nPoints < 4,\n"
+"//             otherwise four indices.\n"
+"// Returns the number of indices written (0, nPoints or 4).\n"
+"//\n"
+"// Each projection is packed into an integer key: the float bit pattern with its low 8 bits\n"
+"// replaced by the point index. Keys are compared as signed ints, so any non-negative\n"
+"// projection beats every negative one, and near-ties resolve towards the higher index.\n"
+"int extractManifoldSequential(const float4* p, int nPoints, float4 nearNormal, int* contactIdx)\n"
+"{\n"
+"	if( nPoints == 0 ) return 0;\n"
+"\n"
+"	nPoints = min2( nPoints, 64 );\n"
+"\n"
+"	// fewer than four candidates: nothing to reduce, and the centroid is not needed\n"
+"	if( nPoints < 4 )\n"
+"	{\n"
+"		for(int i=0; i<nPoints; i++) \n"
+"			contactIdx[i] = i;\n"
+"		return nPoints;\n"
+"	}\n"
+"\n"
+"	// centroid, accumulated in place (p[0] + p[1] + ...) instead of through a 64-entry copy\n"
+"	float4 center = p[0];\n"
+"	for (int i=1;i<nPoints;i++)\n"
+"		center += p[i];\n"
+"	center = center/(float)nPoints;\n"
+"\n"
+"	//	sample 4 directions\n"
+"	float4 aVector = p[0] - center;\n"
+"	float4 u = cross3( nearNormal, aVector );\n"
+"	float4 v = cross3( nearNormal, u );\n"
+"	u = normalize3( u );\n"
+"	v = normalize3( v );\n"
+"\n"
+"	// running per-direction maximum of the packed keys (x:+u, y:-u, z:+v, w:-v)\n"
+"	int4 best;\n"
+"	for(int ie = 0; ie<nPoints; ie++ )\n"
+"	{\n"
+"		float4 r = p[ie]-center;\n"
+"		int4 key;\n"
+"		float f;\n"
+"\n"
+"		f = dot3F4( u, r );\n"
+"		key.x = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"\n"
+"		f = dot3F4( -u, r );\n"
+"		key.y = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"\n"
+"		f = dot3F4( v, r );\n"
+"		key.z = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"\n"
+"		f = dot3F4( -v, r );\n"
+"		key.w = ((*(u32*)&f) & 0xffffff00) | (0xff & ie);\n"
+"\n"
+"		if (ie == 0)\n"
+"		{\n"
+"			best = key;\n"
+"		}\n"
+"		else\n"
+"		{\n"
+"			best.x = (best.x > key.x )? best.x: key.x;\n"
+"			best.y = (best.y > key.y )? best.y: key.y;\n"
+"			best.z = (best.z > key.z )? best.z: key.z;\n"
+"			best.w = (best.w > key.w )? best.w: key.w;\n"
+"		}\n"
+"	}\n"
+"\n"
+"	// the low byte of every key is the index of the winning point\n"
+"	contactIdx[0] = best.x & 0xff;\n"
+"	contactIdx[1] = best.y & 0xff;\n"
+"	contactIdx[2] = best.z & 0xff;\n"
+"	contactIdx[3] = best.w & 0xff;\n"
+"\n"
+"	// The deepest point is not forced into the result here; the code that did so was\n"
+"	// already disabled, so the search for it has been removed as dead work.\n"
+"\n"
+"	//does this sort happen on GPU too?\n"
+"	//std::sort( contactIdx, contactIdx+4 );\n"
+"\n"
+"	return 4;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// One work-item per entry idx < numPairs: copies the contact candidates of that entry into\n"
+"// private memory, reduces them to at most four with extractManifoldSequential and appends one\n"
+"// Contact4 to contactsOut.\n"
+"//\n"
+"// pairs                   body index pairs; NOTE(review): read at pairIndex, not at idx -\n"
+"//                         confirm with the host code that this is intended.\n"
+"// rigidBodies             m_invMass == 0 makes the stored body index negative.\n"
+"// closestPointsWorld      candidate points, addressed through contactOffsets[idx].\n"
+"// separatingNormalsWorld  normal stored with the contact.\n"
+"// contactCounts           number of candidates per entry.\n"
+"// nContactsOut            counter bumped with AppendInc to obtain the output slot.\n"
+"// The output slot is not checked against any capacity (the check is commented out).\n"
+"__kernel void   extractManifoldAndAddContactKernel(__global const int2* pairs, \n"
+"																	__global const BodyData* rigidBodies, \n"
+"																	__global const float4* closestPointsWorld,\n"
+"																	__global const float4* separatingNormalsWorld,\n"
+"																	__global const int* contactCounts,\n"
+"																	__global const int* contactOffsets,\n"
+"																	__global Contact4* restrict contactsOut,\n"
+"																	counter32_t nContactsOut,\n"
+"																	int numPairs,\n"
+"																	int pairIndex\n"
+"																	)\n"
+"{\n"
+"	int idx = get_global_id(0);\n"
+"	\n"
+"	if (idx<numPairs)\n"
+"	{\n"
+"		float4 normal = separatingNormalsWorld[idx];\n"
+"		int nPoints = contactCounts[idx];\n"
+"		__global const float4* pointsIn = &closestPointsWorld[contactOffsets[idx]];\n"
+"		float4 localPoints[64];\n"
+"		// localPoints holds 64 entries; clamp so a larger count cannot overflow it.\n"
+"		// extractManifoldSequential ignores everything past 64 points anyway.\n"
+"		if (nPoints > 64)\n"
+"			nPoints = 64;\n"
+"		for (int i=0;i<nPoints;i++)\n"
+"		{\n"
+"			localPoints[i] = pointsIn[i];\n"
+"		}\n"
+"		int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"		contactIdx[0] = -1;\n"
+"		contactIdx[1] = -1;\n"
+"		contactIdx[2] = -1;\n"
+"		contactIdx[3] = -1;\n"
+"\n"
+"		int nContacts = extractManifoldSequential(localPoints, nPoints, normal, contactIdx);\n"
+"\n"
+"		int dstIdx;\n"
+"		AppendInc( nContactsOut, dstIdx );\n"
+"		//if ((dstIdx+nContacts) < capacity)\n"
+"		{\n"
+"			__global Contact4* c = contactsOut + dstIdx;\n"
+"			c->m_worldNormal = normal;\n"
+"			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"			c->m_batchIdx = idx;\n"
+"			int bodyA = pairs[pairIndex].x;\n"
+"			int bodyB = pairs[pairIndex].y;\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0 ? -bodyA:bodyA;\n"
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0 ? -bodyB:bodyB;\n"
+"			// only the first nContacts entries of contactIdx are valid\n"
+"			for (int i=0;i<nContacts;i++)\n"
+"			{\n"
+"				c->m_worldPos[i] = localPoints[contactIdx[i]];\n"
+"			}\n"
+"			GET_NPOINTS(*c) = nContacts;\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"// Inverts a rigid transform given as translation + rotation quaternion.\n"
+"// orientationOut receives qtInvert(orientationIn); translationOut receives the negated\n"
+"// input translation rotated by that inverse rotation.\n"
+"void	trInverse(float4 translationIn, Quaternion orientationIn,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"{\n"
+"	const Quaternion invRotation = qtInvert(orientationIn);\n"
+"	*orientationOut = invRotation;\n"
+"\n"
+"	const float4 negatedTranslation = -translationIn;\n"
+"	*translationOut = qtRotate(invRotation, negatedTranslation);\n"
+"}\n"
+"\n"
+"// Composes two rigid transforms, A applied after B.\n"
+"// orientationOut = qtMul(orientationA, orientationB);\n"
+"// translationOut = translationB transformed by (translationA, orientationA).\n"
+"void	trMul(float4 translationA, Quaternion orientationA,\n"
+"						float4 translationB, Quaternion orientationB,\n"
+"		float4* translationOut, Quaternion* orientationOut)\n"
+"{\n"
+"	const Quaternion combinedRotation = qtMul(orientationA,orientationB);\n"
+"	const float4 combinedTranslation = transform(&translationB,&translationA,&orientationA);\n"
+"\n"
+"	*orientationOut = combinedRotation;\n"
+"	*translationOut = combinedTranslation;\n"
+"}\n"
+"\n"
+"// Tests body B against the plane of body A and appends at most one Contact4.\n"
+"// Only m_radius of collidable B is used, so B is effectively treated as a sphere.\n"
+"//\n"
+"// pairIndex            stored in the contact as m_batchIdx.\n"
+"// bodyIndexA/B         indices into rigidBodies; A supplies the plane, B the radius.\n"
+"// collidableIndexA/B   indices into collidables.\n"
+"// faces                the plane equation is faces[collidables[A].m_shapeIndex].m_plane.\n"
+"// globalContactsOut    output array; the slot comes from AppendInc on nGlobalContactsOut.\n"
+"// numPairs             used as the output bound: nothing is written when the slot >= numPairs.\n"
+"void	computeContactPlaneConvex(int pairIndex,\n"
+"																int bodyIndexA, int bodyIndexB, \n"
+"																int collidableIndexA, int collidableIndexB, \n"
+"																__global const BodyData* rigidBodies, \n"
+"																__global const btCollidableGpu* collidables,\n"
+"																__global const btGpuFace* faces,\n"
+"																__global Contact4* restrict globalContactsOut,\n"
+"																counter32_t nGlobalContactsOut,\n"
+"																int numPairs)\n"
+"{\n"
+"	float4 planeEq = faces[collidables[collidableIndexA].m_shapeIndex].m_plane;\n"
+"	float radius = collidables[collidableIndexB].m_radius;\n"
+"	float4 posA1 = rigidBodies[bodyIndexA].m_pos;\n"
+"	float4 ornA1 = rigidBodies[bodyIndexA].m_quat;\n"
+"	float4 posB1 = rigidBodies[bodyIndexB].m_pos;\n"
+"	float4 ornB1 = rigidBodies[bodyIndexB].m_quat;\n"
+"	\n"
+"	bool hasCollision = false;\n"
+"	float4 planeNormal1 = make_float4(planeEq.x,planeEq.y,planeEq.z,0.f);\n"
+"	float planeConstant = planeEq.w;\n"
+"	// transform of B expressed in the frame of A: inverse(A) * B\n"
+"	float4 convexInPlaneTransPos1; Quaternion convexInPlaneTransOrn1;\n"
+"	{\n"
+"		float4 invPosA;Quaternion invOrnA;\n"
+"		trInverse(posA1,ornA1,&invPosA,&invOrnA);\n"
+"		trMul(invPosA,invOrnA,posB1,ornB1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	}\n"
+"	// transform of A expressed in the frame of B: inverse(B) * A\n"
+"	float4 planeInConvexPos1;	Quaternion planeInConvexOrn1;\n"
+"	{\n"
+"		float4 invPosB;Quaternion invOrnB;\n"
+"		trInverse(posB1,ornB1,&invPosB,&invOrnB);\n"
+"		trMul(invPosB,invOrnB,posA1,ornA1,&planeInConvexPos1,&planeInConvexOrn1);	\n"
+"	}\n"
+"	// point of B at distance radius along the negated plane normal, in the frame of B\n"
+"	float4 vtx1 = qtRotate(planeInConvexOrn1,-planeNormal1)*radius;\n"
+"	// the same point in the frame of A, and its signed distance to the plane\n"
+"	float4 vtxInPlane1 = transform(&vtx1,&convexInPlaneTransPos1,&convexInPlaneTransOrn1);\n"
+"	float distance = dot3F4(planeNormal1,vtxInPlane1) - planeConstant;\n"
+"	hasCollision = distance < 0.f;//m_manifoldPtr->getContactBreakingThreshold();\n"
+"	if (hasCollision)\n"
+"	{\n"
+"		// project the point onto the plane, move it to world space, then step back by distance\n"
+"		float4 vtxInPlaneProjected1 = vtxInPlane1 -   distance*planeNormal1;\n"
+"		float4 vtxInPlaneWorld1 = transform(&vtxInPlaneProjected1,&posA1,&ornA1);\n"
+"		float4 normalOnSurfaceB1 = qtRotate(ornA1,planeNormal1);\n"
+"		float4 pOnB1 = vtxInPlaneWorld1+normalOnSurfaceB1*distance;\n"
+"		// w of a contact position carries the (negative) penetration distance\n"
+"		pOnB1.w = distance;\n"
+"\n"
+"		int dstIdx;\n"
+"    AppendInc( nGlobalContactsOut, dstIdx );\n"
+"		\n"
+"		if (dstIdx < numPairs)\n"
+"		{\n"
+"			__global Contact4* c = &globalContactsOut[dstIdx];\n"
+"			c->m_worldNormal = normalOnSurfaceB1;\n"
+"			c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"			c->m_batchIdx = pairIndex;\n"
+"			// a body with zero inverse mass is stored with a negated index\n"
+"			c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"			c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"			c->m_worldPos[0] = pOnB1;\n"
+"			GET_NPOINTS(*c) = 1;\n"
+"		}//if (dstIdx < numPairs)\n"
+"	}//if (hasCollision)\n"
+"}\n"
+"\n"
+"\n"
+"// One work-item per pair i < numPairs. Dispatches on the shape types of the two collidables:\n"
+"//   sphere/plane or plane/sphere -> computeContactPlaneConvex (plane passed as body A)\n"
+"//   sphere/sphere                -> analytic single contact\n"
+"//   otherwise, if hasSeparatingAxis[i] is non-zero -> clipHullAgainstHull followed by\n"
+"//   extractManifoldSequential, appending one Contact4 with up to four points.\n"
+"// uniqueEdges is not referenced by this kernel.\n"
+"// The sphere and plane paths only write when the output slot is < numPairs; the hull path\n"
+"// writes without a bound (its capacity check is commented out).\n"
+"__kernel void   clipHullHullKernel( __global const int2* pairs, \n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global const int* hasSeparatingAxis,\n"
+"																					__global Contact4* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numPairs)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	// per-work-item scratch for the clipping ping-pong buffers and the unreduced contacts\n"
+"	float4 worldVertsB1[64];\n"
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	// depth window handed to the clipper: effectively no lower clamp, 0.02 upper cutoff\n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_PLANE)\n"
+"		{\n"
+"\n"
+"\n"
+"			// arguments are swapped so that the plane is passed as body A\n"
+"			computeContactPlaneConvex( pairIndex, bodyIndexB,bodyIndexA,  collidableIndexB,collidableIndexA, \n"
+"																rigidBodies,collidables,faces,	globalContactsOut, nGlobalContactsOut,numPairs);\n"
+"			return;\n"
+"		}\n"
+"\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_PLANE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"\n"
+"\n"
+"			computeContactPlaneConvex(pairIndex, bodyIndexA, bodyIndexB, collidableIndexA, collidableIndexB, \n"
+"																rigidBodies,collidables,faces, globalContactsOut, nGlobalContactsOut,numPairs);\n"
+"			return;\n"
+"			\n"
+"		}\n"
+"	\n"
+"		if (collidables[collidableIndexA].m_shapeType == SHAPE_SPHERE &&\n"
+"			collidables[collidableIndexB].m_shapeType == SHAPE_SPHERE)\n"
+"		{\n"
+"			//sphere-sphere\n"
+"			float radiusA = collidables[collidableIndexA].m_radius;\n"
+"			float radiusB = collidables[collidableIndexB].m_radius;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"\n"
+"			float4 diff = posA-posB;\n"
+"			float len = length(diff);\n"
+"			\n"
+"			///iff distance positive, don't generate a new contact\n"
+"			if ( len <= (radiusA+radiusB))\n"
+"			{\n"
+"				///distance (negative means penetration)\n"
+"				float dist = len - (radiusA+radiusB);\n"
+"				// fallback axis for (nearly) coincident centres\n"
+"				float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n"
+"				if (len > 0.00001)\n"
+"				{\n"
+"					normalOnSurfaceB = diff / len;\n"
+"				}\n"
+"				float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n"
+"				contactPosB.w = dist;\n"
+"								\n"
+"				int dstIdx;\n"
+"        AppendInc( nGlobalContactsOut, dstIdx );\n"
+"				\n"
+"				if (dstIdx < numPairs)\n"
+"				{\n"
+"					__global Contact4* c = &globalContactsOut[dstIdx];\n"
+"					// NOTE(review): sphereSphereCollisionKernel stores +normalOnSurfaceB for the\n"
+"					// same geometry; confirm which sign the solver expects.\n"
+"					c->m_worldNormal = -normalOnSurfaceB;\n"
+"					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n"
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"					c->m_worldPos[0] = contactPosB;\n"
+"					GET_NPOINTS(*c) = 1;\n"
+"				}//if (dstIdx < numPairs)\n"
+"			}//if ( len <= (radiusA+radiusB))\n"
+"\n"
+"			return;\n"
+"		}//SHAPE_SPHERE SHAPE_SPHERE\n"
+"\n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"\n"
+"\n"
+"		\n"
+"			int numLocalContactsOut = clipHullAgainstHull(separatingNormals[i],\n"
+"														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		// still inside the hasSeparatingAxis block despite the shallower indentation\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"				// the stored contact normal is the negated separating normal\n"
+"				float4 normal = -separatingNormals[i];\n"
+"				int nPoints = numLocalContactsOut;\n"
+"				float4* pointsIn = localContactsOut;\n"
+"				int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"\n"
+"				contactIdx[0] = -1;\n"
+"				contactIdx[1] = -1;\n"
+"				contactIdx[2] = -1;\n"
+"				contactIdx[3] = -1;\n"
+"		\n"
+"				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"		\n"
+"				int dstIdx;\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n"
+"				//if ((dstIdx+nReducedContacts) < capacity)\n"
+"				{\n"
+"					__global Contact4* c = globalContactsOut+ dstIdx;\n"
+"					c->m_worldNormal = normal;\n"
+"					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n"
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"\n"
+"					// this loop index shadows the work-item index i declared above\n"
+"					for (int i=0;i<nReducedContacts;i++)\n"
+"					{\n"
+"						c->m_worldPos[i] = pointsIn[contactIdx[i]];\n"
+"					}\n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"				}\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"\n"
+"}\n"
+"\n"
+"\n"
+"// One work-item per compound pair i < numCompoundPairs. Each int4 pair stores\n"
+"// (bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB); a child index >= 0 selects an\n"
+"// entry of gpuChildShapes whose local transform is combined with the body transform, a\n"
+"// negative index means the body's own collidable is used.\n"
+"// When gpuHasCompoundSepNormalsOut[i] is non-zero the two hulls are clipped with\n"
+"// clipHullAgainstHull, reduced with extractManifoldSequential and one Contact4 is appended.\n"
+"// uniqueEdges is not referenced; the output slot is not checked against any capacity.\n"
+"__kernel void   clipCompoundsHullHullKernel( __global const int4* gpuCompoundPairs, \n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const btGpuChildShape* gpuChildShapes,\n"
+"																					__global const float4* gpuCompoundSepNormalsOut,\n"
+"																					__global const int* gpuHasCompoundSepNormalsOut,\n"
+"																					__global Contact4* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numCompoundPairs)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	// per-work-item scratch for the clipping ping-pong buffers and the unreduced contacts\n"
+"	float4 worldVertsB1[64];\n"
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"\n"
+"		if (gpuHasCompoundSepNormalsOut[i])\n"
+"		{\n"
+"\n"
+"			int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"			int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"			\n"
+"			int childShapeIndexA = gpuCompoundPairs[i].z;\n"
+"			int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"			\n"
+"			int collidableIndexA = -1;\n"
+"			int collidableIndexB = -1;\n"
+"			\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			\n"
+"			float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"								\n"
+"			if (childShapeIndexA >= 0)\n"
+"			{\n"
+"				// child of a compound: world transform = body transform * child transform\n"
+"				collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"				float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"				float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"				float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"				float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"				posA = newPosA;\n"
+"				ornA = newOrnA;\n"
+"			} else\n"
+"			{\n"
+"				collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			}\n"
+"			\n"
+"			if (childShapeIndexB>=0)\n"
+"			{\n"
+"				// same composition as for A, here written with transform() for the position\n"
+"				collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"				float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"				float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"				float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"				float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"				posB = newPosB;\n"
+"				ornB = newOrnB;\n"
+"			} else\n"
+"			{\n"
+"				collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"			}\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"\n"
+"			\n"
+"\n"
+"\n"
+"		\n"
+"			int numLocalContactsOut = clipHullAgainstHull(gpuCompoundSepNormalsOut[i],\n"
+"														&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"														posA,ornA,\n"
+"													  posB,ornB,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		// still inside the gpuHasCompoundSepNormalsOut block despite the shallower indentation\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"				// the stored contact normal is the negated separating normal\n"
+"				float4 normal = -gpuCompoundSepNormalsOut[i];\n"
+"				int nPoints = numLocalContactsOut;\n"
+"				float4* pointsIn = localContactsOut;\n"
+"				int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"\n"
+"				contactIdx[0] = -1;\n"
+"				contactIdx[1] = -1;\n"
+"				contactIdx[2] = -1;\n"
+"				contactIdx[3] = -1;\n"
+"		\n"
+"				int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"		\n"
+"				int dstIdx;\n"
+"				AppendInc( nGlobalContactsOut, dstIdx );\n"
+"				//if ((dstIdx+nReducedContacts) < capacity)\n"
+"				{\n"
+"					__global Contact4* c = globalContactsOut+ dstIdx;\n"
+"					c->m_worldNormal = normal;\n"
+"					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = gpuCompoundPairs[pairIndex].x;\n"
+"					int bodyB = gpuCompoundPairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n"
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"\n"
+"					// this loop index shadows the work-item index i declared above\n"
+"					for (int i=0;i<nReducedContacts;i++)\n"
+"					{\n"
+"						c->m_worldPos[i] = pointsIn[contactIdx[i]];\n"
+"					}\n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"				}\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (gpuHasCompoundSepNormalsOut[i])\n"
+"	}//	if (i<numCompoundPairs)\n"
+"\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// One work-item per pair: when both collidables are spheres and they touch or overlap,\n"
+"// appends a single-point Contact4. Pairs of any other shape combination are ignored.\n"
+"// separatingNormals and hasSeparatingAxis are accepted but not read.\n"
+"// A contact is only written when the slot obtained from AppendInc is below numPairs.\n"
+"__kernel void   sphereSphereCollisionKernel( __global const int2* pairs, \n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global const int* hasSeparatingAxis,\n"
+"																					__global Contact4* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numPairs)\n"
+"{\n"
+"	const int pairIndex = get_global_id(0);\n"
+"	if (pairIndex >= numPairs)\n"
+"		return;\n"
+"\n"
+"	const int bodyIndexA = pairs[pairIndex].x;\n"
+"	const int bodyIndexB = pairs[pairIndex].y;\n"
+"\n"
+"	const int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	const int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"\n"
+"	// only sphere-sphere pairs are handled here\n"
+"	if (collidables[collidableIndexA].m_shapeType != SHAPE_SPHERE)\n"
+"		return;\n"
+"	if (collidables[collidableIndexB].m_shapeType != SHAPE_SPHERE)\n"
+"		return;\n"
+"\n"
+"	const float radiusA = collidables[collidableIndexA].m_radius;\n"
+"	const float radiusB = collidables[collidableIndexB].m_radius;\n"
+"	const float radiusSum = radiusA+radiusB;\n"
+"	const float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"	const float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"\n"
+"	const float4 centerDelta = posA-posB;\n"
+"	const float centerDistance = length(centerDelta);\n"
+"\n"
+"	// separated spheres produce no contact (written so that a NaN distance also bails out)\n"
+"	if ( !(centerDistance <= radiusSum) )\n"
+"		return;\n"
+"\n"
+"	// signed distance, negative means penetration\n"
+"	const float dist = centerDistance - radiusSum;\n"
+"\n"
+"	// fallback axis for (nearly) coincident centres\n"
+"	float4 normalOnSurfaceB = make_float4(1.f,0.f,0.f,0.f);\n"
+"	if (centerDistance > 0.00001)\n"
+"	{\n"
+"		normalOnSurfaceB = centerDelta / centerDistance;\n"
+"	}\n"
+"\n"
+"	float4 contactPosB = posB + normalOnSurfaceB*radiusB;\n"
+"	contactPosB.w = dist;\n"
+"\n"
+"	int dstIdx;\n"
+"	AppendInc( nGlobalContactsOut, dstIdx );\n"
+"\n"
+"	if (dstIdx >= numPairs)\n"
+"		return;\n"
+"\n"
+"	__global Contact4* c = &globalContactsOut[dstIdx];\n"
+"	c->m_worldNormal = normalOnSurfaceB;\n"
+"	c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"	c->m_batchIdx = pairIndex;\n"
+"	// a body with zero inverse mass is stored with a negated index\n"
+"	c->m_bodyAPtrAndSignBit = rigidBodies[bodyIndexA].m_invMass==0?-bodyIndexA:bodyIndexA;\n"
+"	c->m_bodyBPtrAndSignBit = rigidBodies[bodyIndexB].m_invMass==0?-bodyIndexB:bodyIndexB;\n"
+"	c->m_worldPos[0] = contactPosB;\n"
+"	GET_NPOINTS(*c) = 1;\n"
+"}				\n"
+"\n"
+"__kernel void   clipHullHullConcaveConvexKernel( __global int4* concavePairsIn,\n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global const ConvexPolyhedronCL* convexShapes, \n"
+"																					__global const float4* vertices,\n"
+"																					__global const float4* uniqueEdges,\n"
+"																					__global const btGpuFace* faces,\n"
+"																					__global const int* indices,\n"
+"																					__global const float4* separatingNormals,\n"
+"																					__global Contact4* restrict globalContactsOut,\n"
+"																					counter32_t nGlobalContactsOut,\n"
+"																					int numConcavePairs)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"	float4 worldVertsB1[64];\n"
+"	float4 worldVertsB2[64];\n"
+"	int capacityWorldVerts = 64;	\n"
+"\n"
+"	float4 localContactsOut[64];\n"
+"	int localContactCapacity=64;\n"
+"	\n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"\n"
+"	if (i<numConcavePairs)\n"
+"	{\n"
+"\n"
+"		int bodyIndexA = concavePairsIn[i].x;\n"
+"		int bodyIndexB = concavePairsIn[i].y;\n"
+"		int f = concavePairsIn[i].z;\n"
+"		\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"		\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		///////////////////////////////////////////////////////////////\n"
+"		\n"
+"	\n"
+"		bool overlap = false;\n"
+"		\n"
+"		ConvexPolyhedronCL convexPolyhedronA;\n"
+"\n"
+"	//add 3 vertices of the triangle\n"
+"		convexPolyhedronA.m_numVertices = 3;\n"
+"		convexPolyhedronA.m_vertexOffset = 0;\n"
+"		float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"\n"
+"		btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"		\n"
+"		float4 verticesA[3];\n"
+"		for (int i=0;i<3;i++)\n"
+"		{\n"
+"			int index = indices[face.m_indexOffset+i];\n"
+"			float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"			verticesA[i] = vert;\n"
+"			localCenter += vert;\n"
+"		}\n"
+"\n"
+"		float dmin = FLT_MAX;\n"
+"\n"
+"		int localCC=0;\n"
+"\n"
+"		//a triangle has 3 unique edges\n"
+"		convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"		convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"		float4 uniqueEdgesA[3];\n"
+"		\n"
+"		uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"		uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"		uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"\n"
+"\n"
+"		convexPolyhedronA.m_faceOffset = 0;\n"
+"                                  \n"
+"		float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"                             \n"
+"		btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"		int indicesA[3+3+2+2+2];\n"
+"		int curUsedIndices=0;\n"
+"		int fidx=0;\n"
+"\n"
+"		//front size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[0] = 0;\n"
+"			indicesA[1] = 1;\n"
+"			indicesA[2] = 2;\n"
+"			curUsedIndices+=3;\n"
+"			float c = face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = normal.x;\n"
+"			facesA[fidx].m_plane.y = normal.y;\n"
+"			facesA[fidx].m_plane.z = normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"		//back size of triangle\n"
+"		{\n"
+"			facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"			indicesA[3]=2;\n"
+"			indicesA[4]=1;\n"
+"			indicesA[5]=0;\n"
+"			curUsedIndices+=3;\n"
+"			float c = dot3F4(normal,verticesA[0]);\n"
+"			float c1 = -face.m_plane.w;\n"
+"			facesA[fidx].m_plane.x = -normal.x;\n"
+"			facesA[fidx].m_plane.y = -normal.y;\n"
+"			facesA[fidx].m_plane.z = -normal.z;\n"
+"			facesA[fidx].m_plane.w = c;\n"
+"			facesA[fidx].m_numIndices=3;\n"
+"		}\n"
+"		fidx++;\n"
+"\n"
+"		bool addEdgePlanes = true;\n"
+"		if (addEdgePlanes)\n"
+"		{\n"
+"			int numVertices=3;\n"
+"			int prevVertex = numVertices-1;\n"
+"			for (int i=0;i<numVertices;i++)\n"
+"			{\n"
+"				float4 v0 = verticesA[i];\n"
+"				float4 v1 = verticesA[prevVertex];\n"
+"                                            \n"
+"				float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"				float c = -dot3F4(edgeNormal,v0);\n"
+"\n"
+"				facesA[fidx].m_numIndices = 2;\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[curUsedIndices++]=i;\n"
+"				indicesA[curUsedIndices++]=prevVertex;\n"
+"                                            \n"
+"				facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"				facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"				facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				fidx++;\n"
+"				prevVertex = i;\n"
+"			}\n"
+"		}\n"
+"		convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"		convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
+"\n"
+"\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"		posA.w = 0.f;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"
+"\n"
+"		float4 sepAxis = separatingNormals[i];\n"
+"		\n"
+"		\n"
+"		////////////////////////////////////////\n"
+"		\n"
+"		\n"
+"		\n"
+"		int numLocalContactsOut = clipHullAgainstHullLocalA(sepAxis,\n"
+"														&convexPolyhedronA, &convexShapes[shapeIndexB],\n"
+"														rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"													  rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"													  worldVertsB1,worldVertsB2,capacityWorldVerts,\n"
+"														minDist, maxDist,\n"
+"														&verticesA,&facesA,&indicesA,\n"
+"														vertices,faces,indices,\n"
+"														localContactsOut,localContactCapacity);\n"
+"												\n"
+"		if (numLocalContactsOut>0)\n"
+"		{\n"
+"			float4 normal = -separatingNormals[i];\n"
+"			int nPoints = numLocalContactsOut;\n"
+"			float4* pointsIn = localContactsOut;\n"
+"			int contactIdx[4];// = {-1,-1,-1,-1};\n"
+"\n"
+"			contactIdx[0] = -1;\n"
+"			contactIdx[1] = -1;\n"
+"			contactIdx[2] = -1;\n"
+"			contactIdx[3] = -1;\n"
+"	\n"
+"			int nReducedContacts = extractManifoldSequential(pointsIn, nPoints, normal, contactIdx);\n"
+"	\n"
+"			int dstIdx;\n"
+"			AppendInc( nGlobalContactsOut, dstIdx );\n"
+"			//if ((dstIdx+nReducedContacts) < capacity)\n"
+"			{\n"
+"				__global Contact4* c = globalContactsOut+ dstIdx;\n"
+"				c->m_worldNormal = normal;\n"
+"				c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"				c->m_batchIdx = pairIndex;\n"
+"				int bodyA = concavePairsIn[pairIndex].x;\n"
+"				int bodyB = concavePairsIn[pairIndex].y;\n"
+"				c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n"
+"				c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"\n"
+"				for (int i=0;i<nReducedContacts;i++)\n"
+"				{\n"
+"					c->m_worldPos[i] = pointsIn[contactIdx[i]];\n"
+"				}\n"
+"				GET_NPOINTS(*c) = nReducedContacts;\n"
+"			}\n"
+"				\n"
+"		}//		if (numContactsOut>0)\n"
+"	}//	if (i<numPairs)\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"int	findClippingFaces(const float4 separatingNormal,\n"
+"                      __global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB,\n"
+"                      const float4 posA, const Quaternion ornA,const float4 posB, const Quaternion ornB,\n"
+"                       __global float4* worldVertsA1,\n"
+"                      __global float4* worldNormalsA1,\n"
+"                      __global float4* worldVertsB1,\n"
+"                      int capacityWorldVerts,\n"
+"                      const float minDist, float maxDist,\n"
+"                      __global const float4* vertices,\n"
+"                      __global const btGpuFace* faces,\n"
+"                      __global const int* indices,\n"
+"                      __global int4* clippingFaces, int pairIndex)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"	int numWorldVertsB1= 0;\n"
+"    \n"
+"    \n"
+"	int closestFaceB=-1;\n"
+"	float dmax = -FLT_MAX;\n"
+"    \n"
+"	{\n"
+"		for(int face=0;face<hullB->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(faces[hullB->m_faceOffset+face].m_plane.x,\n"
+"                                              faces[hullB->m_faceOffset+face].m_plane.y, faces[hullB->m_faceOffset+face].m_plane.z,0.f);\n"
+"			const float4 WorldNormal = qtRotate(ornB, Normal);\n"
+"			float d = dot3F4(WorldNormal,separatingNormal);\n"
+"			if (d > dmax)\n"
+"			{\n"
+"				dmax = d;\n"
+"				closestFaceB = face;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"	{\n"
+"		const btGpuFace polyB = faces[hullB->m_faceOffset+closestFaceB];\n"
+"		const int numVertices = polyB.m_numIndices;\n"
+"		for(int e0=0;e0<numVertices;e0++)\n"
+"		{\n"
+"			const float4 b = vertices[hullB->m_vertexOffset+indices[polyB.m_indexOffset+e0]];\n"
+"			worldVertsB1[pairIndex*capacityWorldVerts+numWorldVertsB1++] = transform(&b,&posB,&ornB);\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int closestFaceA=-1;\n"
+"	{\n"
+"		float dmin = FLT_MAX;\n"
+"		for(int face=0;face<hullA->m_numFaces;face++)\n"
+"		{\n"
+"			const float4 Normal = make_float4(\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.x,\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.y,\n"
+"                                              faces[hullA->m_faceOffset+face].m_plane.z,\n"
+"                                              0.f);\n"
+"			const float4 faceANormalWS = qtRotate(ornA,Normal);\n"
+"            \n"
+"			float d = dot3F4(faceANormalWS,separatingNormal);\n"
+"			if (d < dmin)\n"
+"			{\n"
+"				dmin = d;\n"
+"				closestFaceA = face;\n"
+"                worldNormalsA1[pairIndex] = faceANormalWS;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"    \n"
+"    int numVerticesA = faces[hullA->m_faceOffset+closestFaceA].m_numIndices;\n"
+"	for(int e0=0;e0<numVerticesA;e0++)\n"
+"	{\n"
+"        const float4 a = vertices[hullA->m_vertexOffset+indices[faces[hullA->m_faceOffset+closestFaceA].m_indexOffset+e0]];\n"
+"        worldVertsA1[pairIndex*capacityWorldVerts+e0] = transform(&a, &posA,&ornA);\n"
+"    }\n"
+"    \n"
+"    clippingFaces[pairIndex].x = closestFaceA;\n"
+"    clippingFaces[pairIndex].y = closestFaceB;\n"
+"    clippingFaces[pairIndex].z = numVerticesA;\n"
+"    clippingFaces[pairIndex].w = numWorldVertsB1;\n"
+"    \n"
+"    \n"
+"	return numContactsOut;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"int clipFaces(__global float4* worldVertsA1,\n"
+"              __global float4* worldNormalsA1,\n"
+"              __global float4* worldVertsB1,\n"
+"              __global float4* worldVertsB2, \n"
+"              int capacityWorldVertsB2,\n"
+"              const float minDist, float maxDist,\n"
+"              __global int4* clippingFaces,\n"
+"              int pairIndex)\n"
+"{\n"
+"	int numContactsOut = 0;\n"
+"    \n"
+"    int closestFaceA = clippingFaces[pairIndex].x;\n"
+"    int closestFaceB = clippingFaces[pairIndex].y;\n"
+"	int numVertsInA = clippingFaces[pairIndex].z;\n"
+"	int numVertsInB = clippingFaces[pairIndex].w;\n"
+"    \n"
+"	int numVertsOut = 0;\n"
+"    \n"
+"	if (closestFaceA<0)\n"
+"		return numContactsOut;\n"
+"    \n"
+"    __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n"
+"    __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n"
+"    \n"
+"    \n"
+"	\n"
+"	// clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"    \n"
+"	for(int e0=0;e0<numVertsInA;e0++)\n"
+"	{\n"
+"		const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n"
+"		const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n"
+"		const float4 WorldEdge0 = aw - bw;\n"
+"		float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n"
+"		float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"		float4 worldA1 = aw;\n"
+"		float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"		float4 planeNormalWS = planeNormalWS1;\n"
+"		float planeEqWS=planeEqWS1;\n"
+"		numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n"
+"		__global float4* tmp = pVtxOut;\n"
+"		pVtxOut = pVtxIn;\n"
+"		pVtxIn = tmp;\n"
+"		numVertsInB = numVertsOut;\n"
+"		numVertsOut = 0;\n"
+"	}\n"
+"    \n"
+"    //float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"    //float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"\n"
+"\n"
+"    \n"
+"    /*for (int i=0;i<numVertsInB;i++)\n"
+"    {\n"
+"        pVtxOut[i] = pVtxIn[i];\n"
+"    }*/\n"
+"    \n"
+"    \n"
+"    \n"
+"    \n"
+"    //numVertsInB=0;\n"
+"	\n"
+"    float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"\n"
+"    for (int i=0;i<numVertsInB;i++)\n"
+"    {\n"
+"        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"        if (depth <=minDist)\n"
+"        {\n"
+"            depth = minDist;\n"
+"        }\n"
+"        \n"
+"        if (depth <=maxDist)\n"
+"        {\n"
+"            float4 pointInWorld = pVtxIn[i];\n"
+"            pVtxOut[numContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"        }\n"
+"    }\n"
+"   \n"
+"    clippingFaces[pairIndex].w =numContactsOut;\n"
+"   \n"
+"    \n"
+"	return numContactsOut;\n"
+"\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   findClippingFacesKernel(  __global const int2* pairs,\n"
+"                                        __global const BodyData* rigidBodies,\n"
+"                                        __global const btCollidableGpu* collidables,\n"
+"                                        __global const ConvexPolyhedronCL* convexShapes,\n"
+"                                        __global const float4* vertices,\n"
+"                                        __global const float4* uniqueEdges,\n"
+"                                        __global const btGpuFace* faces,\n"
+"                                        __global const int* indices,\n"
+"                                        __global const float4* separatingNormals,\n"
+"                                        __global const int* hasSeparatingAxis,\n"
+"                                        __global int4* clippingFacesOut,\n"
+"                                        __global float4* worldVertsA1,\n"
+"                                        __global float4* worldNormalsA1,\n"
+"                                        __global float4* worldVertsB1,\n"
+"                                        int capacityWorldVerts,\n"
+"                                        int numPairs\n"
+"                                        )\n"
+"{\n"
+"    \n"
+"	int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"    \n"
+"	\n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"			\n"
+"			int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"			int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"			\n"
+"			int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"			int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"			\n"
+"            \n"
+"            \n"
+"			int numLocalContactsOut = findClippingFaces(separatingNormals[i],\n"
+"                                                        &convexShapes[shapeIndexA], &convexShapes[shapeIndexB],\n"
+"                                                        rigidBodies[bodyIndexA].m_pos,rigidBodies[bodyIndexA].m_quat,\n"
+"                                                        rigidBodies[bodyIndexB].m_pos,rigidBodies[bodyIndexB].m_quat,\n"
+"                                                        worldVertsA1,\n"
+"                                                        worldNormalsA1,\n"
+"                                                        worldVertsB1,capacityWorldVerts,\n"
+"                                                        minDist, maxDist,\n"
+"                                                        vertices,faces,indices,\n"
+"                                                        clippingFacesOut,i);\n"
+"            \n"
+"            \n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   clipFacesAndContactReductionKernel( __global const int2* pairs,\n"
+"                                                   __global const BodyData* rigidBodies,\n"
+"                                                   __global const float4* separatingNormals,\n"
+"                                                   __global const int* hasSeparatingAxis,\n"
+"                                                     __global Contact4* globalContactsOut,\n"
+"                                                   __global int4* clippingFacesOut,\n"
+"                                                   __global float4* worldVertsA1,\n"
+"                                                   __global float4* worldNormalsA1,\n"
+"                                                   __global float4* worldVertsB1,\n"
+"                                                   __global float4* worldVertsB2,\n"
+"                                                   counter32_t nGlobalContactsOut,\n"
+"                                                    int vertexFaceCapacity,\n"
+"                                                   int numPairs,\n"
+"					                                        int debugMode\n"
+"                                                   )\n"
+"{\n"
+"    int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"    \n"
+"	float minDist = -1e30f;\n"
+"	float maxDist = 0.02f;\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			int bodyIndexA = pairs[i].x;\n"
+"			int bodyIndexB = pairs[i].y;\n"
+"		    \n"
+"            int numLocalContactsOut = 0;\n"
+"\n"
+"            int capacityWorldVertsB2 = vertexFaceCapacity;\n"
+"            \n"
+"            __global float4* pVtxIn = &worldVertsB1[pairIndex*capacityWorldVertsB2];\n"
+"            __global float4* pVtxOut = &worldVertsB2[pairIndex*capacityWorldVertsB2];\n"
+"            \n"
+"\n"
+"            {\n"
+"                __global int4* clippingFaces = clippingFacesOut;\n"
+"            \n"
+"                \n"
+"                int closestFaceA = clippingFaces[pairIndex].x;\n"
+"                int closestFaceB = clippingFaces[pairIndex].y;\n"
+"                int numVertsInA = clippingFaces[pairIndex].z;\n"
+"                int numVertsInB = clippingFaces[pairIndex].w;\n"
+"                \n"
+"                int numVertsOut = 0;\n"
+"                \n"
+"                if (closestFaceA>=0)\n"
+"                {\n"
+"                    \n"
+"                    \n"
+"                    \n"
+"                    // clip polygon to back of planes of all faces of hull A that are adjacent to witness face\n"
+"                    \n"
+"                    for(int e0=0;e0<numVertsInA;e0++)\n"
+"                    {\n"
+"                        const float4 aw = worldVertsA1[pairIndex*capacityWorldVertsB2+e0];\n"
+"                        const float4 bw = worldVertsA1[pairIndex*capacityWorldVertsB2+((e0+1)%numVertsInA)];\n"
+"                        const float4 WorldEdge0 = aw - bw;\n"
+"                        float4 worldPlaneAnormal1 = worldNormalsA1[pairIndex];\n"
+"                        float4 planeNormalWS1 = -cross3(WorldEdge0,worldPlaneAnormal1);\n"
+"                        float4 worldA1 = aw;\n"
+"                        float planeEqWS1 = -dot3F4(worldA1,planeNormalWS1);\n"
+"                        float4 planeNormalWS = planeNormalWS1;\n"
+"                        float planeEqWS=planeEqWS1;\n"
+"                        numVertsOut = clipFaceGlobal(pVtxIn, numVertsInB, planeNormalWS,planeEqWS, pVtxOut);\n"
+"                        __global float4* tmp = pVtxOut;\n"
+"                        pVtxOut = pVtxIn;\n"
+"                        pVtxIn = tmp;\n"
+"                        numVertsInB = numVertsOut;\n"
+"                        numVertsOut = 0;\n"
+"                    }\n"
+"                    \n"
+"                    float4 planeNormalWS = worldNormalsA1[pairIndex];\n"
+"                    float planeEqWS=-dot3F4(planeNormalWS,worldVertsA1[pairIndex*capacityWorldVertsB2]);\n"
+"                    \n"
+"                    for (int i=0;i<numVertsInB;i++)\n"
+"                    {\n"
+"                        float depth = dot3F4(planeNormalWS,pVtxIn[i])+planeEqWS;\n"
+"                        if (depth <=minDist)\n"
+"                        {\n"
+"                            depth = minDist;\n"
+"                        }\n"
+"                        \n"
+"                        if (depth <=maxDist)\n"
+"                        {\n"
+"                            float4 pointInWorld = pVtxIn[i];\n"
+"                            pVtxOut[numLocalContactsOut++] = make_float4(pointInWorld.x,pointInWorld.y,pointInWorld.z,depth);\n"
+"                        }\n"
+"                    }\n"
+"                    \n"
+"                }\n"
+"                clippingFaces[pairIndex].w =numLocalContactsOut;\n"
+"                \n"
+"\n"
+"            }\n"
+"            \n"
+"            for (int i=0;i<numLocalContactsOut;i++)\n"
+"                pVtxIn[i] = pVtxOut[i];\n"
+"                \n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"    \n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   newContactReductionKernel( __global const int2* pairs,\n"
+"                                                   __global const BodyData* rigidBodies,\n"
+"                                                   __global const float4* separatingNormals,\n"
+"                                                   __global const int* hasSeparatingAxis,\n"
+"                                                   __global Contact4* globalContactsOut,\n"
+"                                                   __global int4* clippingFaces,\n"
+"                                                   __global float4* worldVertsB2,\n"
+"                                                   volatile __global int* nGlobalContactsOut,\n"
+"                                                   int vertexFaceCapacity,\n"
+"                                                   int numPairs\n"
+"                                                   )\n"
+"{\n"
+"    int i = get_global_id(0);\n"
+"	int pairIndex = i;\n"
+"	\n"
+"    int4 contactIdx;\n"
+"    contactIdx=make_int4(0,1,2,3);\n"
+"    \n"
+"	if (i<numPairs)\n"
+"	{\n"
+"        \n"
+"		if (hasSeparatingAxis[i])\n"
+"		{\n"
+"            \n"
+"			\n"
+"            \n"
+"            \n"
+"			int nPoints = clippingFaces[pairIndex].w;\n"
+"           \n"
+"            if (nPoints>0)\n"
+"            {\n"
+"\n"
+"                 __global float4* pointsIn = &worldVertsB2[pairIndex*vertexFaceCapacity];\n"
+"                float4 normal = -separatingNormals[i];\n"
+"                \n"
+"                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n"
+"            \n"
+"                int dstIdx;\n"
+"                AppendInc( nGlobalContactsOut, dstIdx );\n"
+"				\n"
+"//#if 0\n"
+"                \n"
+"				if (dstIdx < numPairs)\n"
+"				{\n"
+"					__global Contact4* c = &globalContactsOut[dstIdx];\n"
+"					c->m_worldNormal = normal;\n"
+"					c->m_coeffs = (u32)(0.f*0xffff) | ((u32)(0.7f*0xffff)<<16);\n"
+"					c->m_batchIdx = pairIndex;\n"
+"					int bodyA = pairs[pairIndex].x;\n"
+"					int bodyB = pairs[pairIndex].y;\n"
+"					c->m_bodyAPtrAndSignBit = rigidBodies[bodyA].m_invMass==0?-bodyA:bodyA;\n"
+"					c->m_bodyBPtrAndSignBit = rigidBodies[bodyB].m_invMass==0?-bodyB:bodyB;\n"
+"                    \n"
+"                    switch (nReducedContacts)\n"
+"                    {\n"
+"                        case 4:\n"
+"                            c->m_worldPos[3] = pointsIn[contactIdx.w];\n"
+"                        case 3:\n"
+"                            c->m_worldPos[2] = pointsIn[contactIdx.z];\n"
+"                        case 2:\n"
+"                            c->m_worldPos[1] = pointsIn[contactIdx.y];\n"
+"                        case 1:\n"
+"                            c->m_worldPos[0] = pointsIn[contactIdx.x];\n"
+"                        default:\n"
+"                        {\n"
+"                        }\n"
+"                    };\n"
+"                    \n"
+"					GET_NPOINTS(*c) = nReducedContacts;\n"
+"                    \n"
+"                 }\n"
+"                 \n"
+"                \n"
+"//#endif\n"
+"				\n"
+"			}//		if (numContactsOut>0)\n"
+"		}//		if (hasSeparatingAxis[i])\n"
+"	}//	if (i<numPairs)\n"
+"\n"
+"    \n"
+"    \n"
+"}\n"
+"\n"
+;
diff --git a/opencl/gpu_sat/kernels/satKernels.h b/opencl/gpu_sat/kernels/satKernels.h
new file mode 100644
index 000000000..b40a359b4
--- /dev/null
+++ b/opencl/gpu_sat/kernels/satKernels.h
@@ -0,0 +1,1265 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* satKernelsCL= \
+"//keep this enum in sync with the CPU version (in AdlCollisionShape.h)\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_CONCAVE_TRIMESH 5\n"
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
+"\n"
+"\n"
+"\n"
+"typedef unsigned int u32;\n"
+"\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"{\n"
+"	int m_numChildShapes;\n"
+"	int blaat2;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"\n"
+"\n"
+"typedef struct  \n"
+"{\n"
+"	float4		m_localCenter;\n"
+"	float4		m_extents;\n"
+"	float4		mC;\n"
+"	float4		mE;\n"
+"	\n"
+"	float			m_radius;\n"
+"	int	m_faceOffset;\n"
+"	int m_numFaces;\n"
+"	int	m_numVertices;\n"
+"\n"
+"	int m_vertexOffset;\n"
+"	int	m_uniqueEdgesOffset;\n"
+"	int	m_numUniqueEdges;\n"
+"	int m_unused;\n"
+"} ConvexPolyhedronCL;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_plane;\n"
+"	int m_indexOffset;\n"
+"	int m_numIndices;\n"
+"} btGpuFace;\n"
+"\n"
+"#define make_float4 (float4)\n"
+"\n"
+"\n"
+"__inline\n"
+"float4 cross3(float4 a, float4 b)\n"
+"{\n"
+"	return cross(a,b);\n"
+"\n"
+"	\n"
+"//	float4 a1 = make_float4(a.xyz,0.f);\n"
+"//	float4 b1 = make_float4(b.xyz,0.f);\n"
+"\n"
+"//	return cross(a1,b1);\n"
+"\n"
+"//float4 c = make_float4(a.y*b.z - a.z*b.y,a.z*b.x - a.x*b.z,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//	float4 c = make_float4(a.y*b.z - a.z*b.y,1.f,a.x*b.y - a.y*b.x,0.f);\n"
+"	\n"
+"	//return c;\n"
+"}\n"
+"\n"
+"__inline\n"
+"float dot3F4(float4 a, float4 b)\n"
+"{\n"
+"	float4 a1 = make_float4(a.xyz,0.f);\n"
+"	float4 b1 = make_float4(b.xyz,0.f);\n"
+"	return dot(a1, b1);\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 fastNormalize4(float4 v)\n"
+"{\n"
+"	v = make_float4(v.xyz,0.f);\n"
+"	return fast_normalize(v);\n"
+"}\n"
+"\n"
+"\n"
+"///////////////////////////////////////\n"
+"//	Quaternion\n"
+"///////////////////////////////////////\n"
+"\n"
+"typedef float4 Quaternion;\n"
+"\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b);\n"
+"\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in);\n"
+"\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec);\n"
+"\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q);\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__inline\n"
+"Quaternion qtMul(Quaternion a, Quaternion b)\n"
+"{\n"
+"	Quaternion ans;\n"
+"	ans = cross3( a, b );\n"
+"	ans += a.w*b+b.w*a;\n"
+"//	ans.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);\n"
+"	ans.w = a.w*b.w - dot3F4(a, b);\n"
+"	return ans;\n"
+"}\n"
+"\n"
+"__inline\n"
+"Quaternion qtNormalize(Quaternion in)\n"
+"{\n"
+"	return fastNormalize4(in);\n"
+"//	in /= length( in );\n"
+"//	return in;\n"
+"}\n"
+"__inline\n"
+"float4 qtRotate(Quaternion q, float4 vec)\n"
+"{\n"
+"	Quaternion qInv = qtInvert( q );\n"
+"	float4 vcpy = vec;\n"
+"	vcpy.w = 0.f;\n"
+"	float4 out = qtMul(qtMul(q,vcpy),qInv);\n"
+"	return out;\n"
+"}\n"
+"\n"
+"__inline\n"
+"Quaternion qtInvert(Quaternion q)\n"
+"{\n"
+"	return (Quaternion)(-q.xyz, q.w);\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 qtInvRotate(const Quaternion q, float4 vec)\n"
+"{\n"
+"	return qtRotate( qtInvert( q ), vec );\n"
+"}\n"
+"\n"
+"__inline\n"
+"float4 transform(const float4* p, const float4* translation, const Quaternion* orientation)\n"
+"{\n"
+"	return qtRotate( *orientation, *p ) + (*translation);\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"__inline\n"
+"float4 normalize3(const float4 a)\n"
+"{\n"
+"	float4 n = make_float4(a.x, a.y, a.z, 0.f);\n"
+"	return fastNormalize4( n );\n"
+"}\n"
+"\n"
+"inline void projectLocal(const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, const float4* vertices, float* min, float* max)\n"
+"{\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"}\n"
+"\n"
+"inline void project(__global const ConvexPolyhedronCL* hull,  const float4 pos, const float4 orn, \n"
+"const float4* dir, __global const float4* vertices, float* min, float* max)\n"
+"{\n"
+"	min[0] = FLT_MAX;\n"
+"	max[0] = -FLT_MAX;\n"
+"	int numVerts = hull->m_numVertices;\n"
+"\n"
+"	const float4 localDir = qtInvRotate(orn,*dir);\n"
+"	float offset = dot(pos,*dir);\n"
+"	for(int i=0;i<numVerts;i++)\n"
+"	{\n"
+"		float dp = dot(vertices[hull->m_vertexOffset+i],localDir);\n"
+"		if(dp < min[0])	\n"
+"			min[0] = dp;\n"
+"		if(dp > max[0])	\n"
+"			max[0] = dp;\n"
+"	}\n"
+"	if(min[0]>max[0])\n"
+"	{\n"
+"		float tmp = min[0];\n"
+"		min[0] = max[0];\n"
+"		max[0] = tmp;\n"
+"	}\n"
+"	min[0] += offset;\n"
+"	max[0] += offset;\n"
+"}\n"
+"\n"
+"// Single-axis overlap test between a private-memory hull A and a __global hull B.\n"
+"// Returns false when the projections of the two hulls onto *sep_axis are disjoint\n"
+"// (*depth is not written in that case). Otherwise stores the smaller of the two\n"
+"// overlap distances in *depth and returns true.\n"
+"inline bool TestSepAxisLocalA(const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, const float4* verticesA, __global const float4* verticesB,float* depth)\n"
+"{\n"
+"	float loA,hiA;\n"
+"	float loB,hiB;\n"
+"	projectLocal(hullA,posA,ornA,sep_axis,verticesA, &loA, &hiA);\n"
+"	project(hullB,posB,ornB, sep_axis,verticesB, &loB, &hiB);\n"
+"\n"
+"	const bool disjoint = (hiA<loB) || (hiB<loA);\n"
+"	if(disjoint)\n"
+"		return false;\n"
+"\n"
+"	const float overlapAB = hiA - loB;\n"
+"	const float overlapBA = hiB - loA;\n"
+"	*depth = (overlapAB<overlapBA) ? overlapAB : overlapBA;\n"
+"	return true;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"// True when none of the x, y, z components of v exceeds 1e-6 in magnitude.\n"
+"// The w component is not looked at.\n"
+"inline bool IsAlmostZero(const float4 v)\n"
+"{\n"
+"	const float eps = 1e-6f;\n"
+"	const bool anyLarge = (fabs(v.x)>eps) || (fabs(v.y)>eps) || (fabs(v.z)>eps);\n"
+"	return !anyLarge;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// Face-normal pass of the SAT test for a private-memory hull A against a __global hull B.\n"
+"// Every face plane of hullA is rotated by ornA, flipped when it points against DeltaC2,\n"
+"// and used as a candidate axis.\n"
+"// Returns false as soon as one axis separates the hulls. Otherwise returns true and,\n"
+"// for every axis whose overlap is smaller than *dmin, stores the overlap in *dmin and\n"
+"// the axis in *sep.\n"
+"// *dmin and *sep are read before they are necessarily written, so the caller must\n"
+"// initialise both. The unique-edge, index and hull-B face arguments are not used here.\n"
+"bool findSeparatingAxisLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// work on copies with w cleared; the projection helpers take dot(pos,axis) over all four components\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"\n"
+"	// Test normals from hullA\n"
+"	const int numFacesA = hullA->m_numFaces;\n"
+"	for(int f=0;f<numFacesA;f++)\n"
+"	{\n"
+"		const float4 normal = facesA[hullA->m_faceOffset+f].m_plane;\n"
+"		float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"		if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"			faceANormalWS*=-1.f;\n"
+"\n"
+"		float d;\n"
+"		if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, verticesA, verticesB,&d))\n"
+"			return false;\n"
+"		if(d<*dmin)\n"
+"		{\n"
+"			*dmin = d;\n"
+"			*sep = faceANormalWS;\n"
+"		}\n"
+"	}\n"
+"	// final orientation fix-up of the reported axis relative to DeltaC2\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"\n"
+"// Face-normal pass of the SAT test where the hull supplying the face normals (hullA) lives\n"
+"// in __global memory and the other hull (hullB) lives in private memory.\n"
+"// Every face plane of hullA is rotated by ornA, flipped when it points against DeltaC2,\n"
+"// and tested with TestSepAxisLocalA, which expects the private-memory hull first; that is\n"
+"// why the A and B arguments are swapped in the call below.\n"
+"// Returns false as soon as one axis separates the hulls. Otherwise returns true and,\n"
+"// for every axis whose overlap is smaller than *dmin, stores the overlap in *dmin and\n"
+"// the axis in *sep.\n"
+"// *dmin and *sep are read before they are necessarily written, so the caller must\n"
+"// initialise both. The unique-edge, index and hull-B face arguments are not used here.\n"
+"bool findSeparatingAxisLocalB(	__global const ConvexPolyhedronCL* hullA,  const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* verticesA, \n"
+"	__global const float4* uniqueEdgesA, \n"
+"	__global const btGpuFace* facesA,\n"
+"	__global const int*  indicesA,\n"
+"	const float4* verticesB,\n"
+"	const float4* uniqueEdgesB, \n"
+"	const btGpuFace* facesB,\n"
+"	const int*  indicesB,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// work on copies with w cleared; the projection helpers take dot(pos,axis) over all four components\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"\n"
+"	// Test normals from hullA\n"
+"	const int numFacesA = hullA->m_numFaces;\n"
+"	for(int f=0;f<numFacesA;f++)\n"
+"	{\n"
+"		const float4 normal = facesA[hullA->m_faceOffset+f].m_plane;\n"
+"		float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"		if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"			faceANormalWS *= -1.f;\n"
+"\n"
+"		float d;\n"
+"		if(!TestSepAxisLocalA( hullB, hullA, posB,ornB,posA,ornA, &faceANormalWS, verticesB,verticesA, &d))\n"
+"			return false;\n"
+"		if(d<*dmin)\n"
+"		{\n"
+"			*dmin = d;\n"
+"			*sep = faceANormalWS;\n"
+"		}\n"
+"	}\n"
+"	// final orientation fix-up of the reported axis relative to DeltaC2\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"// Edge-edge pass of the SAT test for a private-memory hull A against a __global hull B.\n"
+"// For every pair (unique edge of A, unique edge of B) the cross product of the two\n"
+"// rotated edge directions is used as a candidate axis; pairs whose cross product is\n"
+"// almost zero are skipped.\n"
+"// Returns false as soon as one axis separates the hulls. Otherwise returns true and,\n"
+"// for every axis whose overlap is smaller than *dmin, stores the overlap in *dmin and\n"
+"// the axis in *sep.\n"
+"// *dmin and *sep are read before they are necessarily written, so the caller must\n"
+"// initialise both. The face and index arguments are not used here.\n"
+"bool findSeparatingAxisEdgeEdgeLocalA(	const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	const float4* verticesA, \n"
+"	const float4* uniqueEdgesA, \n"
+"	const btGpuFace* facesA,\n"
+"	const int*  indicesA,\n"
+"	__global const float4* verticesB, \n"
+"	__global const float4* uniqueEdgesB, \n"
+"	__global const btGpuFace* facesB,\n"
+"	__global const int*  indicesB,\n"
+"		float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// work on copies with w cleared; the projection helpers take dot(pos,axis) over all four components\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"\n"
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdgesA[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdgesB[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			// (near-)parallel edges do not give a usable axis\n"
+"			if(IsAlmostZero(crossje))\n"
+"				continue;\n"
+"\n"
+"			crossje = normalize3(crossje);\n"
+"			if (dot3F4(DeltaC2,crossje)<0)\n"
+"				crossje *= -1.f;\n"
+"\n"
+"			// Disjoint projections mean crossje separates the hulls. The previous code\n"
+"			// recorded this in a flag that was overwritten and never read, so the\n"
+"			// function could not report a separating edge-edge axis.\n"
+"			float dist;\n"
+"			if(!TestSepAxisLocalA( hullA, hullB, posA,ornA,posB,ornB,&crossje, verticesA, verticesB,&dist))\n"
+"				return false;\n"
+"\n"
+"			if(dist<*dmin)\n"
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	// final orientation fix-up of the reported axis relative to DeltaC2\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"\n"
+"\n"
+"// Single-axis overlap test between two hulls that both live in __global memory and\n"
+"// share one vertex array.\n"
+"// Returns false when the projections of the two hulls onto *sep_axis are disjoint\n"
+"// (*depth is not written in that case). Otherwise stores the smaller of the two\n"
+"// overlap distances in *depth and returns true.\n"
+"inline bool TestSepAxis(__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA,const float4 ornA,\n"
+"	const float4 posB,const float4 ornB,\n"
+"	float4* sep_axis, __global const float4* vertices,float* depth)\n"
+"{\n"
+"	float loA,hiA;\n"
+"	float loB,hiB;\n"
+"	project(hullA,posA,ornA,sep_axis,vertices, &loA, &hiA);\n"
+"	project(hullB,posB,ornB, sep_axis,vertices, &loB, &hiB);\n"
+"\n"
+"	const bool disjoint = (hiA<loB) || (hiB<loA);\n"
+"	if(disjoint)\n"
+"		return false;\n"
+"\n"
+"	const float overlapAB = hiA - loB;\n"
+"	const float overlapBA = hiB - loA;\n"
+"	*depth = (overlapAB<overlapBA) ? overlapAB : overlapBA;\n"
+"	return true;\n"
+"}\n"
+"\n"
+"\n"
+"// Face-normal pass of the SAT test for two hulls in __global memory.\n"
+"// Every face plane of hullA is rotated by ornA, flipped when it points against DeltaC2,\n"
+"// and used as a candidate axis.\n"
+"// Returns false as soon as one axis separates the hulls. Otherwise returns true and,\n"
+"// for every axis whose overlap is smaller than *dmin, stores the overlap in *dmin and\n"
+"// the axis in *sep.\n"
+"// *dmin and *sep are read before they are necessarily written, so the caller must\n"
+"// initialise both. uniqueEdges and indices are not used here.\n"
+"bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices, \n"
+"	__global const float4* uniqueEdges, \n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int*  indices,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// work on copies with w cleared; project() takes dot(pos,axis) over all four components\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"\n"
+"	// Test normals from hullA\n"
+"	const int numFacesA = hullA->m_numFaces;\n"
+"	for(int f=0;f<numFacesA;f++)\n"
+"	{\n"
+"		const float4 normal = faces[hullA->m_faceOffset+f].m_plane;\n"
+"		float4 faceANormalWS = qtRotate(ornA,normal);\n"
+"\n"
+"		if (dot3F4(DeltaC2,faceANormalWS)<0)\n"
+"			faceANormalWS*=-1.f;\n"
+"\n"
+"		float d;\n"
+"		if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&faceANormalWS, vertices,&d))\n"
+"			return false;\n"
+"\n"
+"		if(d<*dmin)\n"
+"		{\n"
+"			*dmin = d;\n"
+"			*sep = faceANormalWS;\n"
+"		}\n"
+"	}\n"
+"\n"
+"	// final orientation fix-up of the reported axis relative to DeltaC2\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"\n"
+"	return true;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"// Edge-edge pass of the SAT test for two hulls in __global memory.\n"
+"// For every pair (unique edge of A, unique edge of B) the cross product of the two\n"
+"// rotated edge directions is used as a candidate axis; pairs whose cross product is\n"
+"// almost zero are skipped.\n"
+"// Returns false as soon as one axis separates the hulls. Otherwise returns true and,\n"
+"// for every axis whose overlap is smaller than *dmin, stores the overlap in *dmin and\n"
+"// the axis in *sep.\n"
+"// *dmin and *sep are read before they are necessarily written, so the caller must\n"
+"// initialise both. faces and indices are not used here.\n"
+"bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices, \n"
+"	__global const float4* uniqueEdges, \n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int*  indices,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	// work on copies with w cleared; project() takes dot(pos,axis) over all four components\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"\n"
+"	// Test edges\n"
+"	for(int e0=0;e0<hullA->m_numUniqueEdges;e0++)\n"
+"	{\n"
+"		const float4 edge0 = uniqueEdges[hullA->m_uniqueEdgesOffset+e0];\n"
+"		float4 edge0World = qtRotate(ornA,edge0);\n"
+"\n"
+"		for(int e1=0;e1<hullB->m_numUniqueEdges;e1++)\n"
+"		{\n"
+"			const float4 edge1 = uniqueEdges[hullB->m_uniqueEdgesOffset+e1];\n"
+"			float4 edge1World = qtRotate(ornB,edge1);\n"
+"\n"
+"			float4 crossje = cross3(edge0World,edge1World);\n"
+"			// (near-)parallel edges do not give a usable axis\n"
+"			if(IsAlmostZero(crossje))\n"
+"				continue;\n"
+"\n"
+"			crossje = normalize3(crossje);\n"
+"			if (dot3F4(DeltaC2,crossje)<0)\n"
+"				crossje*=-1.f;\n"
+"\n"
+"			// Disjoint projections mean crossje separates the hulls. The previous code\n"
+"			// recorded this in a flag that was overwritten and never read, so the\n"
+"			// function could not report a separating edge-edge axis.\n"
+"			float dist;\n"
+"			if(!TestSepAxis( hullA, hullB, posA,ornA,posB,ornB,&crossje, vertices,&dist))\n"
+"				return false;\n"
+"\n"
+"			if(dist<*dmin)\n"
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	// final orientation fix-up of the reported axis relative to DeltaC2\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
+"\n"
+"\n"
+"// work-in-progress\n"
+"// SAT test for (body, child shape) pairs. One work-item handles\n"
+"// gpuCompoundPairs[i] = (bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB),\n"
+"// the layout written by findCompoundPairsKernel. A negative child index means the body\n"
+"// is used through its own collidable; otherwise the child transform from gpuChildShapes\n"
+"// is combined with the body transform.\n"
+"// Output: gpuHasCompoundSepNormalsOut[i] is 1 and gpuCompoundSepNormalsOut[i] holds the\n"
+"// axis with the smallest overlap only when none of the three SAT passes (faces of A,\n"
+"// faces of B, edge-edge) reports a separating axis. Otherwise the flag is 0 and the\n"
+"// normal slot is not written. aabbs is not used.\n"
+"__kernel void   processCompoundPairsKernel( __global const int4* gpuCompoundPairs,\n"
+"	__global const BodyData* rigidBodies, \n"
+"	__global const btCollidableGpu* collidables,\n"
+"	__global const ConvexPolyhedronCL* convexShapes, \n"
+"	__global const float4* vertices,\n"
+"	__global const float4* uniqueEdges,\n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int* indices,\n"
+"	__global btAabbCL* aabbs,\n"
+"	__global const btGpuChildShape* gpuChildShapes,\n"
+"	__global volatile float4* gpuCompoundSepNormalsOut,\n"
+"	__global volatile int* gpuHasCompoundSepNormalsOut,\n"
+"	int numCompoundPairs\n"
+"	)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	if (i<numCompoundPairs)\n"
+"	{\n"
+"		int bodyIndexA = gpuCompoundPairs[i].x;\n"
+"		int bodyIndexB = gpuCompoundPairs[i].y;\n"
+"\n"
+"		int childShapeIndexA = gpuCompoundPairs[i].z;\n"
+"		int childShapeIndexB = gpuCompoundPairs[i].w;\n"
+"\n"
+"		int collidableIndexA = -1;\n"
+"		int collidableIndexB = -1;\n"
+"\n"
+"		float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"		float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"\n"
+"		float4 ornB = rigidBodies[bodyIndexB].m_quat;\n"
+"		float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"\n"
+"		// resolve the collidable and transform of side A\n"
+"		if (childShapeIndexA >= 0)\n"
+"		{\n"
+"			collidableIndexA = gpuChildShapes[childShapeIndexA].m_shapeIndex;\n"
+"			float4 childPosA = gpuChildShapes[childShapeIndexA].m_childPosition;\n"
+"			float4 childOrnA = gpuChildShapes[childShapeIndexA].m_childOrientation;\n"
+"			float4 newPosA = qtRotate(ornA,childPosA)+posA;\n"
+"			float4 newOrnA = qtMul(ornA,childOrnA);\n"
+"			posA = newPosA;\n"
+"			ornA = newOrnA;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		}\n"
+"\n"
+"		// resolve the collidable and transform of side B\n"
+"		if (childShapeIndexB>=0)\n"
+"		{\n"
+"			collidableIndexB = gpuChildShapes[childShapeIndexB].m_shapeIndex;\n"
+"			float4 childPosB = gpuChildShapes[childShapeIndexB].m_childPosition;\n"
+"			float4 childOrnB = gpuChildShapes[childShapeIndexB].m_childOrientation;\n"
+"			float4 newPosB = transform(&childPosB,&posB,&ornB);\n"
+"			float4 newOrnB = qtMul(ornB,childOrnB);\n"
+"			posB = newPosB;\n"
+"			ornB = newOrnB;\n"
+"		} else\n"
+"		{\n"
+"			collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;	\n"
+"		}\n"
+"\n"
+"		// default result; only overwritten when all three passes find an overlap\n"
+"		gpuHasCompoundSepNormalsOut[i] = 0;\n"
+"\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"\n"
+"		float dmin = FLT_MAX;\n"
+"		posA.w = 0.f;\n"
+"		posB.w = 0.f;\n"
+"		float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"		float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"		float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"		float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"		const float4 DeltaC2 = c0 - c1;\n"
+"		float4 sepNormal = make_float4(1,0,0,0);\n"
+"\n"
+"		bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"
+"		if (sepA)\n"
+"		{\n"
+"			bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,posA,ornA,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"
+"			if (sepB)\n"
+"			{\n"
+"				bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,posB,ornB,DeltaC2,vertices,uniqueEdges,faces,indices,&sepNormal,&dmin);\n"
+"				if (sepEE)\n"
+"				{\n"
+"					gpuCompoundSepNormalsOut[i] = sepNormal;//fastNormalize4(sepNormal);\n"
+"					gpuHasCompoundSepNormalsOut[i] = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"}\n"
+"\n"
+"// work-in-progress\n"
+"// Expands every pair that involves at least one SHAPE_COMPOUND_OF_CONVEX_HULLS collidable\n"
+"// into one entry per child-shape combination. Each entry is an int4\n"
+"// (bodyIndexA, bodyIndexB, childShapeIndexA, childShapeIndexB); a child index of -1 marks\n"
+"// the side that is not a compound. Pairs of two bodies with zero inverse mass and pairs\n"
+"// without any compound produce nothing.\n"
+"// No culling is done: every child combination is emitted.\n"
+"// numCompoundPairsOut is incremented for every entry, including entries that are dropped\n"
+"// because maxNumCompoundPairsCapacity was reached, so the value read back can exceed the\n"
+"// capacity and has to be clamped by the reader.\n"
+"// convexShapes, vertices, uniqueEdges, faces, indices, aabbs and gpuChildShapes are not used.\n"
+"__kernel void   findCompoundPairsKernel( __global const int2* pairs, \n"
+"	__global const BodyData* rigidBodies, \n"
+"	__global const btCollidableGpu* collidables,\n"
+"	__global const ConvexPolyhedronCL* convexShapes, \n"
+"	__global const float4* vertices,\n"
+"	__global const float4* uniqueEdges,\n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int* indices,\n"
+"	__global btAabbCL* aabbs,\n"
+"	__global const btGpuChildShape* gpuChildShapes,\n"
+"	__global volatile int4* gpuCompoundPairsOut,\n"
+"	__global volatile int* numCompoundPairsOut,\n"
+"	int numPairs,\n"
+"	int maxNumCompoundPairsCapacity\n"
+"	)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numPairs)\n"
+"		return;\n"
+"\n"
+"	int bodyIndexA = pairs[i].x;\n"
+"	int bodyIndexB = pairs[i].y;\n"
+"\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"\n"
+"	//once the broadphase avoids static-static pairs, we can remove this test\n"
+"	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"		return;\n"
+"\n"
+"	bool isCompoundA = (collidables[collidableIndexA].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS);\n"
+"	bool isCompoundB = (collidables[collidableIndexB].m_shapeType==SHAPE_COMPOUND_OF_CONVEX_HULLS);\n"
+"	if (!isCompoundA && !isCompoundB)\n"
+"		return;\n"
+"\n"
+"	if (isCompoundA)\n"
+"	{\n"
+"		// child c of a compound has child-shape index m_shapeIndex+c\n"
+"		int firstChildA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int numChildrenA = collidables[collidableIndexA].m_numChildShapes;\n"
+"		for (int c=0;c<numChildrenA;c++)\n"
+"		{\n"
+"			int childShapeIndexA = firstChildA+c;\n"
+"\n"
+"			if (isCompoundB)\n"
+"			{\n"
+"				// compound against compound: all child combinations\n"
+"				int firstChildB = collidables[collidableIndexB].m_shapeIndex;\n"
+"				int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"				for (int b=0;b<numChildrenB;b++)\n"
+"				{\n"
+"					int childShapeIndexB = firstChildB+b;\n"
+"					int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"					if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"					{\n"
+"						gpuCompoundPairsOut[compoundPairIdx]  = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,childShapeIndexB);\n"
+"					}\n"
+"				}\n"
+"			} else\n"
+"			{\n"
+"				// child of A against the whole of B\n"
+"				int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"				if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"				{\n"
+"					gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,childShapeIndexA,-1);\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		return;\n"
+"	}\n"
+"\n"
+"	// only B is a compound: the whole of A against every child of B\n"
+"	int firstChildB = collidables[collidableIndexB].m_shapeIndex;\n"
+"	int numChildrenB = collidables[collidableIndexB].m_numChildShapes;\n"
+"	for (int b=0;b<numChildrenB;b++)\n"
+"	{\n"
+"		int childShapeIndexB = firstChildB+b;\n"
+"		int compoundPairIdx = atomic_inc(numCompoundPairsOut);\n"
+"		if (compoundPairIdx<maxNumCompoundPairsCapacity)\n"
+"		{\n"
+"			gpuCompoundPairsOut[compoundPairIdx] = (int4)(bodyIndexA,bodyIndexB,-1,childShapeIndexB);\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"// work-in-progress\n"
+"// One work-item per pair. Outcomes:\n"
+"//  - both bodies have zero inverse mass: hasSeparatingAxis[i] = 0.\n"
+"//  - body A is SHAPE_CONCAVE_TRIMESH: every triangle of A whose AABB overlaps\n"
+"//    aabbs[bodyIndexB] is turned into a small private-memory convex (front face, back\n"
+"//    face, three edge planes) and SAT-tested against shape B. Triangles for which no\n"
+"//    separating axis is found are appended to concavePairsOut as\n"
+"//    (bodyIndexA, bodyIndexB, faceIndex, 3), with the axis in concaveSeparatingNormalsOut\n"
+"//    and the overlap depth in its w component. hasSeparatingAxis[i] is set to 0.\n"
+"//  - otherwise, unless both shapes are SHAPE_CONVEX_HULL: hasSeparatingAxis[i] = 0.\n"
+"//  - both are convex hulls: hasSeparatingAxis[i] = 1 and separatingNormals[i] is written\n"
+"//    when no separating axis is found; hasSeparatingAxis[i] = 0 otherwise.\n"
+"// numConcavePairsOut is incremented even when the entry does not fit into\n"
+"// maxNumConcavePairsCapacity, so the value read back can exceed the capacity.\n"
+"__kernel void   findSeparatingAxisKernel( __global const int2* pairs, \n"
+"	__global const BodyData* rigidBodies, \n"
+"	__global const btCollidableGpu* collidables,\n"
+"	__global const ConvexPolyhedronCL* convexShapes, \n"
+"	__global const float4* vertices,\n"
+"	__global const float4* uniqueEdges,\n"
+"	__global const btGpuFace* faces,\n"
+"	__global const int* indices,\n"
+"	__global btAabbCL* aabbs,\n"
+"	__global volatile float4* separatingNormals,\n"
+"	__global volatile int* hasSeparatingAxis,\n"
+"	__global int4* concavePairsOut,\n"
+"	__global float4* concaveSeparatingNormalsOut,\n"
+"	__global volatile int* numConcavePairsOut,\n"
+"	int numPairs,\n"
+"	int maxNumConcavePairsCapacity\n"
+"	)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numPairs)\n"
+"		return;\n"
+"\n"
+"	int bodyIndexA = pairs[i].x;\n"
+"	int bodyIndexB = pairs[i].y;\n"
+"\n"
+"	int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"	int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"\n"
+"	int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"	int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"\n"
+"	//once the broadphase avoids static-static pairs, we can remove this test\n"
+"	if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"	{\n"
+"		hasSeparatingAxis[i] = 0;\n"
+"		return;\n"
+"	}\n"
+"\n"
+"	if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))// && (collidables[collidableIndexB].m_shapeType==SHAPE_CONVEX_HULL))\n"
+"	{\n"
+"		int numFacesA = convexShapes[shapeIndexA].m_numFaces;\n"
+"\n"
+"		for (int f=0;f<numFacesA;f++)\n"
+"		{\n"
+"			ConvexPolyhedronCL convexPolyhedronA;\n"
+"\n"
+"			//add 3 vertices of the triangle\n"
+"			convexPolyhedronA.m_numVertices = 3;\n"
+"			convexPolyhedronA.m_vertexOffset = 0;\n"
+"			float4	localCenter = make_float4(0.f,0.f,0.f,0.f);\n"
+"\n"
+"			btGpuFace face = faces[convexShapes[shapeIndexA].m_faceOffset+f];\n"
+"			btAabbCL triAabb;\n"
+"			triAabb.m_min = make_float4(1e30f,1e30f,1e30f,0.f);\n"
+"			triAabb.m_max = make_float4(-1e30f,-1e30f,-1e30f,0.f);\n"
+"\n"
+"			float4 verticesA[3];\n"
+"			for (int v=0;v<3;v++)\n"
+"			{\n"
+"				int index = indices[face.m_indexOffset+v];\n"
+"				float4 vert = vertices[convexShapes[shapeIndexA].m_vertexOffset+index];\n"
+"				verticesA[v] = vert;\n"
+"				localCenter += vert;\n"
+"#if 0\n"
+"//just in case some implementation doesn't support component-wise min and max for float4\n"
+"				if (triAabb.m_min.x > vert.x)\n"
+"					triAabb.m_min.x = vert.x;\n"
+"				if (triAabb.m_min.y > vert.y)\n"
+"					triAabb.m_min.y = vert.y;\n"
+"				if (triAabb.m_min.z > vert.z)\n"
+"					triAabb.m_min.z = vert.z;\n"
+"\n"
+"				if (triAabb.m_max.x < vert.x)\n"
+"					triAabb.m_max.x = vert.x;\n"
+"				if (triAabb.m_max.y < vert.y)\n"
+"					triAabb.m_max.y = vert.y;\n"
+"				if (triAabb.m_max.z < vert.z)\n"
+"					triAabb.m_max.z = vert.z;\n"
+"#else\n"
+"				triAabb.m_min = min(triAabb.m_min,vert);\n"
+"				triAabb.m_max = max(triAabb.m_max,vert);\n"
+"#endif\n"
+"			}\n"
+"\n"
+"			// NOTE(review): the triangle AABB is built from the untransformed mesh vertices and\n"
+"			// compared with aabbs[bodyIndexB] as-is; that presumably relies on the trimesh body\n"
+"			// having an identity transform - confirm.\n"
+"			bool overlap = true;\n"
+"			overlap = (triAabb.m_min.x > aabbs[bodyIndexB].m_max.x || triAabb.m_max.x < aabbs[bodyIndexB].m_min.x) ? false : overlap;\n"
+"			overlap = (triAabb.m_min.z > aabbs[bodyIndexB].m_max.z || triAabb.m_max.z < aabbs[bodyIndexB].m_min.z) ? false : overlap;\n"
+"			overlap = (triAabb.m_min.y > aabbs[bodyIndexB].m_max.y || triAabb.m_max.y < aabbs[bodyIndexB].m_min.y) ? false : overlap;\n"
+"			if (!overlap)\n"
+"				continue;\n"
+"\n"
+"			float dmin = FLT_MAX;\n"
+"			float4 sepAxis=make_float4(1,2,3,4);\n"
+"\n"
+"			//a triangle has 3 unique edges\n"
+"			convexPolyhedronA.m_numUniqueEdges = 3;\n"
+"			convexPolyhedronA.m_uniqueEdgesOffset = 0;\n"
+"			float4 uniqueEdgesA[3];\n"
+"			uniqueEdgesA[0] = (verticesA[1]-verticesA[0]);\n"
+"			uniqueEdgesA[1] = (verticesA[2]-verticesA[1]);\n"
+"			uniqueEdgesA[2] = (verticesA[0]-verticesA[2]);\n"
+"\n"
+"			convexPolyhedronA.m_faceOffset = 0;\n"
+"\n"
+"			float4 normal = make_float4(face.m_plane.x,face.m_plane.y,face.m_plane.z,0.f);\n"
+"\n"
+"			btGpuFace facesA[TRIANGLE_NUM_CONVEX_FACES];\n"
+"			int indicesA[3+3+2+2+2];\n"
+"			int curUsedIndices=0;\n"
+"			int fidx=0;\n"
+"\n"
+"			//front side of triangle\n"
+"			{\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[0] = 0;\n"
+"				indicesA[1] = 1;\n"
+"				indicesA[2] = 2;\n"
+"				curUsedIndices+=3;\n"
+"				float c = face.m_plane.w;\n"
+"				facesA[fidx].m_plane.x = normal.x;\n"
+"				facesA[fidx].m_plane.y = normal.y;\n"
+"				facesA[fidx].m_plane.z = normal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				facesA[fidx].m_numIndices=3;\n"
+"			}\n"
+"			fidx++;\n"
+"			//back side of triangle: reversed winding, negated normal\n"
+"			{\n"
+"				facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"				indicesA[3]=2;\n"
+"				indicesA[4]=1;\n"
+"				indicesA[5]=0;\n"
+"				curUsedIndices+=3;\n"
+"				float c = dot(normal,verticesA[0]);\n"
+"				facesA[fidx].m_plane.x = -normal.x;\n"
+"				facesA[fidx].m_plane.y = -normal.y;\n"
+"				facesA[fidx].m_plane.z = -normal.z;\n"
+"				facesA[fidx].m_plane.w = c;\n"
+"				facesA[fidx].m_numIndices=3;\n"
+"			}\n"
+"			fidx++;\n"
+"\n"
+"			//one plane per triangle edge, each referencing the two edge vertices\n"
+"			{\n"
+"				int numVertices=3;\n"
+"				int prevVertex = numVertices-1;\n"
+"				for (int v=0;v<numVertices;v++)\n"
+"				{\n"
+"					float4 v0 = verticesA[v];\n"
+"					float4 v1 = verticesA[prevVertex];\n"
+"\n"
+"					float4 edgeNormal = normalize(cross(normal,v1-v0));\n"
+"					float c = -dot(edgeNormal,v0);\n"
+"\n"
+"					facesA[fidx].m_numIndices = 2;\n"
+"					facesA[fidx].m_indexOffset=curUsedIndices;\n"
+"					indicesA[curUsedIndices++]=v;\n"
+"					indicesA[curUsedIndices++]=prevVertex;\n"
+"\n"
+"					facesA[fidx].m_plane.x = edgeNormal.x;\n"
+"					facesA[fidx].m_plane.y = edgeNormal.y;\n"
+"					facesA[fidx].m_plane.z = edgeNormal.z;\n"
+"					facesA[fidx].m_plane.w = c;\n"
+"					fidx++;\n"
+"					prevVertex = v;\n"
+"				}\n"
+"			}\n"
+"			convexPolyhedronA.m_numFaces = TRIANGLE_NUM_CONVEX_FACES;\n"
+"			convexPolyhedronA.m_localCenter = localCenter*(1.f/3.f);\n"
+"\n"
+"			float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"			posA.w = 0.f;\n"
+"			float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"			posB.w = 0.f;\n"
+"			float4 c0local = convexPolyhedronA.m_localCenter;\n"
+"			float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"			float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"			float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"			float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"			float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"			const float4 DeltaC2 = c0 - c1;\n"
+"\n"
+"			// stays 0 unless all three SAT passes fail to find a separating axis\n"
+"			// (the helpers clear the w of the positions themselves, so posA/posB can be passed)\n"
+"			int triangleOverlaps = 0;\n"
+"			bool sepA = findSeparatingAxisLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],posA,ornA,\n"
+"				posB,ornB,\n"
+"				DeltaC2,\n"
+"				verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"				vertices,uniqueEdges,faces,indices,\n"
+"				&sepAxis,&dmin);\n"
+"			if (sepA)\n"
+"			{\n"
+"				bool sepB = findSeparatingAxisLocalB(	&convexShapes[shapeIndexB],&convexPolyhedronA,posB,ornB,\n"
+"					posA,ornA,\n"
+"					DeltaC2,\n"
+"					vertices,uniqueEdges,faces,indices,\n"
+"					verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"					&sepAxis,&dmin);\n"
+"				if (sepB)\n"
+"				{\n"
+"					bool sepEE = findSeparatingAxisEdgeEdgeLocalA(	&convexPolyhedronA, &convexShapes[shapeIndexB],posA,ornA,\n"
+"						posB,ornB,\n"
+"						DeltaC2,\n"
+"						verticesA,uniqueEdgesA,facesA,indicesA,\n"
+"						vertices,uniqueEdges,faces,indices,\n"
+"						&sepAxis,&dmin);\n"
+"					if (sepEE)\n"
+"					{\n"
+"						triangleOverlaps = 1;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"\n"
+"			if (triangleOverlaps)\n"
+"			{\n"
+"				int pairIdx = atomic_inc(numConcavePairsOut);\n"
+"				if (pairIdx<maxNumConcavePairsCapacity)\n"
+"				{\n"
+"					concavePairsOut[pairIdx].x = bodyIndexA;\n"
+"					concavePairsOut[pairIdx].y = bodyIndexB;\n"
+"					concavePairsOut[pairIdx].z = f;\n"
+"					concavePairsOut[pairIdx].w = 3;\n"
+"					sepAxis.w = dmin;\n"
+"					concaveSeparatingNormalsOut[pairIdx]=sepAxis;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		//todo//??\n"
+"		hasSeparatingAxis[i] = 0;\n"
+"		return;\n"
+"	}\n"
+"\n"
+"	if ((collidables[collidableIndexA].m_shapeType!=SHAPE_CONVEX_HULL) ||(collidables[collidableIndexB].m_shapeType!=SHAPE_CONVEX_HULL))\n"
+"	{\n"
+"		hasSeparatingAxis[i] = 0;\n"
+"		return;\n"
+"	}\n"
+"\n"
+"	float dmin = FLT_MAX;\n"
+"\n"
+"	float4 posA = rigidBodies[bodyIndexA].m_pos;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = rigidBodies[bodyIndexB].m_pos;\n"
+"	posB.w = 0.f;\n"
+"	float4 c0local = convexShapes[shapeIndexA].m_localCenter;\n"
+"	float4 ornA = rigidBodies[bodyIndexA].m_quat;\n"
+"	float4 c0 = transform(&c0local, &posA, &ornA);\n"
+"	float4 c1local = convexShapes[shapeIndexB].m_localCenter;\n"
+"	float4 ornB =rigidBodies[bodyIndexB].m_quat;\n"
+"	float4 c1 = transform(&c1local,&posB,&ornB);\n"
+"	const float4 DeltaC2 = c0 - c1;\n"
+"	// findSeparatingAxis reads *sep even when hull A has no faces, so give it a defined value\n"
+"	float4 sepNormal = make_float4(1,0,0,0);\n"
+"\n"
+"	// stays 0 unless all three SAT passes fail to find a separating axis\n"
+"	int hullsOverlap = 0;\n"
+"	bool sepA = findSeparatingAxis(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"		posB,ornB,\n"
+"		DeltaC2,\n"
+"		vertices,uniqueEdges,faces,\n"
+"		indices,&sepNormal,&dmin);\n"
+"	if (sepA)\n"
+"	{\n"
+"		bool sepB = findSeparatingAxis(	&convexShapes[shapeIndexB],&convexShapes[shapeIndexA],posB,ornB,\n"
+"			posA,ornA,\n"
+"			DeltaC2,\n"
+"			vertices,uniqueEdges,faces,\n"
+"			indices,&sepNormal,&dmin);\n"
+"		if (sepB)\n"
+"		{\n"
+"			bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"				posB,ornB,\n"
+"				DeltaC2,\n"
+"				vertices,uniqueEdges,faces,\n"
+"				indices,&sepNormal,&dmin);\n"
+"			if (sepEE)\n"
+"			{\n"
+"				hullsOverlap = 1;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"\n"
+"	if (hullsOverlap)\n"
+"	{\n"
+"		hasSeparatingAxis[i] = 1;\n"
+"		separatingNormals[i] = sepNormal;\n"
+"	} else\n"
+"	{\n"
+"		hasSeparatingAxis[i] = 0;\n"
+"	}\n"
+"\n"
+"}\n"
+;
diff --git a/opencl/gpu_sat/test/main.cpp b/opencl/gpu_sat/test/main.cpp
new file mode 100644
index 000000000..b40a82a9f
--- /dev/null
+++ b/opencl/gpu_sat/test/main.cpp
@@ -0,0 +1,112 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <stdio.h>
+#include "../basic_initialize/btOpenCLUtils.h"
+#include "../host/ConvexHullContact.h"
+
+#include "parallel_primitives/host/btVector3.h"
+#include "parallel_primitives/host/btFillCL.h"
+#include "parallel_primitives/host/btBoundSearchCL.h"
+#include "parallel_primitives/host/btRadixSort32CL.h"
+#include "parallel_primitives/host/btPrefixScanCL.h"
+#include "parallel_primitives/host/CommandLineArgs.h"
+#include "../host/ConvexHullContact.h"
+
+#include "parallel_primitives/host/btMinMax.h"
+int g_nPassed = 0;	// tests reported with g_testFailed clear
+int g_nFailed = 0;	// tests reported with g_testFailed set
+bool g_testFailed = false;	// per-test failure flag: cleared by TEST_INIT, set by TEST_ASSERT
+// The statement-like macros are wrapped in do { } while(0) so each expands to exactly one statement (safe in un-braced if/else).
+#define TEST_INIT do { g_testFailed = false; } while(0)
+#define TEST_ASSERT(x) do { if( !(x) ){g_testFailed = true;} } while(0)
+#define TEST_REPORT(testName) do { printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++; } while(0)
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))	// num rounded up to a multiple of alignment
+
+cl_context g_context=0;	// assigned in initCL, released in exitCL
+cl_device_id g_device=0;	// obtained in initCL via btOpenCLUtils::getDevice(g_context,0)
+cl_command_queue g_queue =0;	// created in initCL on g_device, released in exitCL
+const char* g_deviceName = 0;	// set in initCL from btOpenCLDeviceInfo::m_deviceName
+
+// Creates the global OpenCL context and, when it reports at least one device, a command queue on device 0.
+// Both indices are forwarded unchanged to btOpenCLUtils::createContextFromType.
+void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	int ciErrNum = 0;
+	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+
+	g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	int numDev = btOpenCLUtils::getNumDevices(g_context);
+	if (numDev>0)
+	{
+		// static: the global g_deviceName keeps pointing at info.m_deviceName after this function returns
+		static btOpenCLDeviceInfo info;
+		g_device= btOpenCLUtils::getDevice(g_context,0);
+		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		btOpenCLUtils::printDeviceInfo(g_device);
+		btOpenCLUtils::getDeviceInfo(g_device,&info);
+		g_deviceName = info.m_deviceName;
+	}
+}
+
+// Releases the global command queue first and then the global context.
+void exitCL()
+{
+	clReleaseCommandQueue(g_queue);
+	clReleaseContext(g_context);
+}
+
+// Smoke test: constructs a GpuSatCollision on the global context/device/queue and destroys it again.
+inline void gpuConvexHullContactTest()
+{
+	TEST_INIT;
+	TEST_ASSERT(1);	// constant condition: never sets g_testFailed
+
+	GpuSatCollision* sat = new GpuSatCollision(g_context,g_device,g_queue);
+	delete sat;
+
+	// prints the [O]/[X] line and increments g_nPassed or g_nFailed
+	TEST_REPORT( "gpuConvexHullContactTest" );
+}
+
+// Entry point: reads the "deviceId" and "platformId" arguments, sets up OpenCL, runs the test and prints a summary.
+int main(int argc, char** argv)
+{
+	int preferredDeviceIndex = -1;
+	int preferredPlatformIndex = -1;
+
+	CommandLineArgs args(argc, argv);
+	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
+	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
+
+	initCL(preferredDeviceIndex,preferredPlatformIndex);
+
+	gpuConvexHullContactTest();
+
+	printf("%d tests passed\n",g_nPassed);
+	if (g_nFailed)
+	{
+		printf("%d tests failed\n",g_nFailed);
+	}
+	printf("End, press <enter>\n");
+
+	// block until a character arrives on stdin, as announced by the message above
+	getchar();
+	exitCL();
+}
+
diff --git a/opencl/gpu_sat/test/premake4.lua b/opencl/gpu_sat/test/premake4.lua
new file mode 100644
index 000000000..b4cbae720
--- /dev/null
+++ b/opencl/gpu_sat/test/premake4.lua
@@ -0,0 +1,46 @@
+-- Declares the console project "OpenCL_sat_test_<vendor>" when findOpenCL(vendor) returns a truthy value.
+function createProject(vendor)	
+	hasCL = findOpenCL(vendor)
+	if (hasCL) then
+
+		project ("OpenCL_sat_test_" .. vendor)
+
+		initOpenCL(vendor)
+
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+		includedirs {"..","../.."}
+		
+		-- test driver, OpenCL utility sources, the ConvexHullContact host code and the parallel_primitives host sources
+		files {
+			"main.cpp",
+			"../../basic_initialize/btOpenCLInclude.h",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../host/ConvexHullContact.cpp",
+			"../host/ConvexHullContact.h",
+			"../../parallel_primitives/host/btFillCL.cpp",
+			"../../parallel_primitives/host/btFillCL.h",
+			"../../parallel_primitives/host/btBoundSearchCL.cpp",
+			"../../parallel_primitives/host/btBoundSearchCL.h",
+			"../../parallel_primitives/host/btPrefixScanCL.cpp",
+			"../../parallel_primitives/host/btPrefixScanCL.h",
+			"../../parallel_primitives/host/btRadixSort32CL.cpp",
+			"../../parallel_primitives/host/btRadixSort32CL.h",
+			"../../parallel_primitives/host/btAlignedAllocator.cpp",
+			"../../parallel_primitives/host/btAlignedAllocator.h",
+			"../../parallel_primitives/host/btAlignedObjectArray.h",
+			"../../parallel_primitives/host/btQuickprof.cpp",
+			"../../parallel_primitives/host/btQuickprof.h",
+
+		}
+		
+	end
+end
+-- Call createProject once per OpenCL vendor name.
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")
\ No newline at end of file
diff --git a/opencl/parallel_primitives/host/btFillCL.h b/opencl/parallel_primitives/host/btFillCL.h
index a9303a73d..9de498b3d 100644
--- a/opencl/parallel_primitives/host/btFillCL.h
+++ b/opencl/parallel_primitives/host/btFillCL.h
@@ -4,83 +4,9 @@
 #include "btOpenCLArray.h"
 #include "btScalar.h"
 
-ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
-{
-	BT_DECLARE_ALIGNED_ALLOCATOR();
+#include "btInt2.h"
+#include "btInt4.h"
 
-	union
-	{
-		struct
-		{
-			unsigned int x,y,z,w;
-		};
-		struct
-		{
-			unsigned int s[4];
-		};
-	};
-};
-
-ATTRIBUTE_ALIGNED16(struct) btInt4
-{
-	BT_DECLARE_ALIGNED_ALLOCATOR();
-
-	union
-	{
-		struct
-		{
-			int x,y,z,w;
-		};
-		struct
-		{
-			int s[4];
-		};
-	};
-};
-
-struct btUnsignedInt2
-{
-	union
-	{
-		struct
-		{
-			unsigned int x,y;
-		};
-		struct
-		{
-			unsigned int s[2];
-		};
-	};
-};
-
-struct btInt2
-{
-	union
-	{
-		struct
-		{
-			int x,y;
-		};
-		struct
-		{
-			int s[2];
-		};
-	};
-};
-
-SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
-{
-	btInt4 v;
-	v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
-	return v;
-}
-
-SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
-{
-	btUnsignedInt4 v;
-	v.s[0] = x; v.s[1] = y; v.s[2] = z; v.s[3] = w;
-	return v;
-}
 
 class btFillCL
 {
diff --git a/opencl/parallel_primitives/host/btInt2.h b/opencl/parallel_primitives/host/btInt2.h
new file mode 100644
index 000000000..a0a2977d8
--- /dev/null
+++ b/opencl/parallel_primitives/host/btInt2.h
@@ -0,0 +1,35 @@
+#ifndef BT_INT2_H
+#define BT_INT2_H
+// Two unsigned ints that can be addressed by name (x, y) or by index (s[0], s[1]).
+struct btUnsignedInt2
+{
+	union	// both anonymous structs below overlay the same storage
+	{
+		struct
+		{
+			unsigned int x,y;
+		};
+		struct
+		{
+			unsigned int s[2];
+		};
+	};
+};
+// Two signed ints that can be addressed by name (x, y) or by index (s[0], s[1]).
+struct btInt2
+{
+	union	// both anonymous structs below overlay the same storage
+	{
+		struct
+		{
+			int x,y;
+		};
+		struct
+		{
+			int s[2];
+		};
+	};
+};
+
+
+#endif
\ No newline at end of file
diff --git a/opencl/parallel_primitives/host/btInt4.h b/opencl/parallel_primitives/host/btInt4.h
new file mode 100644
index 000000000..c4539e481
--- /dev/null
+++ b/opencl/parallel_primitives/host/btInt4.h
@@ -0,0 +1,55 @@
+#ifndef BT_INT4_H
+#define BT_INT4_H
+
+#include "btScalar.h"
+// Four unsigned ints addressable by name (x, y, z, w) or by index (s[0..3]); declared through ATTRIBUTE_ALIGNED16 from btScalar.h.
+ATTRIBUTE_ALIGNED16(struct) btUnsignedInt4
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();	// NOTE(review): presumably supplies aligned operator new/delete - confirm in btScalar.h
+
+	union	// both anonymous structs below overlay the same storage
+	{
+		struct
+		{
+			unsigned int x,y,z,w;
+		};
+		struct
+		{
+			unsigned int s[4];
+		};
+	};
+};
+// Four signed ints addressable by name (x, y, z, w) or by index (s[0..3]); declared through ATTRIBUTE_ALIGNED16 from btScalar.h.
+ATTRIBUTE_ALIGNED16(struct) btInt4
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();	// NOTE(review): presumably supplies aligned operator new/delete - confirm in btScalar.h
+
+	union	// both anonymous structs below overlay the same storage
+	{
+		struct
+		{
+			int x,y,z,w;
+		};
+		struct
+		{
+			int s[4];
+		};
+	};
+};
+// Returns a btInt4 holding (x, y, z, w); w is 0 when the caller omits it.
+SIMD_FORCE_INLINE btInt4 btMakeInt4(int x, int y, int z, int w = 0)
+{
+	btInt4 result;
+	result.x = x; result.y = y; result.z = z; result.w = w;
+	return result;
+}
+// Returns a btUnsignedInt4 holding (x, y, z, w); w is 0 when the caller omits it.
+SIMD_FORCE_INLINE btUnsignedInt4 btMakeUnsignedInt4(unsigned int x, unsigned int y, unsigned int z, unsigned int w = 0)
+{
+	btUnsignedInt4 result;
+	result.x = x; result.y = y; result.z = z; result.w = w;
+	return result;
+}
+
+
+#endif //BT_INT4_H