Enable MPR by default

Add the contact point from MPR, in addition to SAT/clipping contacts. Added a new kernel to clear/reset the number of contacts in pairs (stored in the z component) Always sample unit sphere directions, if there are more edge-edge combinations than unit sphere directions (162 by default) Remember last running demo for Bullet 3 (and save it in a text file, Bullet Enable the testFileFracture.bullet in the Bullet2FileDemo
2014-01-15 10:35:51 -08:00
parent ca86e135de
commit 472a4068fc
27 changed files with 573 additions and 65 deletions
--- a/Demos3/AllBullet2Demos/BulletDemoEntries.h
+++ b/Demos3/AllBullet2Demos/BulletDemoEntries.h
@@ -9,7 +9,6 @@

 #include "../bullet2/RagdollDemo/RagdollDemo.h"

-#include <stdio.h>

 struct BulletDemoEntry
 {
@@ -30,10 +29,10 @@ static BulletDemoEntry allDemos[]=

 };

+#include <stdio.h>

-static const char* startFileName = "startDemo.txt";

-static void saveCurrentDemoEntry(int currentEntry)
+static void saveCurrentDemoEntry(int currentEntry,const char* startFileName)
 {
 	FILE* f = fopen(startFileName,"w");
 	if (f)
@@ -43,7 +42,7 @@ static void saveCurrentDemoEntry(int currentEntry)
 	}
 };

-static int loadCurrentDemoEntry()
+static int loadCurrentDemoEntry(const char* startFileName)
 {
 	int currentEntry= 0;
 	FILE* f = fopen(startFileName,"r");
--- a/Demos3/AllBullet2Demos/main.cpp
+++ b/Demos3/AllBullet2Demos/main.cpp
@@ -7,7 +7,7 @@
 #include "BulletDemoEntries.h"

 #define DEMO_SELECTION_COMBOBOX 13
-
+const char* startFileName = "bulletDemo.txt";
 static SimpleOpenGL3App* app=0;
 static GwenUserInterface* gui  = 0;
 static int sCurrentDemoIndex = 0;
@@ -120,7 +120,7 @@ void	MyComboBoxCallback(int comboId, const char* item)
 			if (strcmp(item,allNames[i])==0)
 			{
 				selectDemo(i);
-				saveCurrentDemoEntry(sCurrentDemoIndex);
+				saveCurrentDemoEntry(sCurrentDemoIndex,startFileName);
 				break;
 			}
 		}
@@ -157,7 +157,7 @@ int main(int argc, char* argv[])
 		allNames.push_back(allDemos[i].m_name);
 	}
 		
-	selectDemo(loadCurrentDemoEntry());
+	selectDemo(loadCurrentDemoEntry(startFileName));
 	gui->registerComboBox(DEMO_SELECTION_COMBOBOX,allNames.size(),&allNames[0],sCurrentDemoIndex);
 		
 	//const char* names2[] = {"comboF", "comboG","comboH"};
--- a/Demos3/GpuDemos/main_opengl3core.cpp
+++ b/Demos3/GpuDemos/main_opengl3core.cpp
@@ -75,6 +75,31 @@ extern bool gUseCalculateOverlappingPairsHost;
 extern bool gIntegrateOnCpu;
 extern bool gConvertConstraintOnCpu;

+
+static const char* sStartFileName = "bullet3StartDemo.txt";
+
+static void saveCurrentDemoEntry(int currentEntry,const char* startFileName)
+{
+	FILE* f = fopen(startFileName,"w");
+	if (f)
+	{
+		fprintf(f,"%d\n",currentEntry);
+		fclose(f);
+	}
+};
+
+static int loadCurrentDemoEntry(const char* startFileName)
+{
+	int currentEntry= 0;
+	FILE* f = fopen(startFileName,"r");
+	if (f)
+	{
+		fscanf(f,"%d",&currentEntry);
+		fclose(f);
+	}
+	return currentEntry;
+};
+
 static void MyResizeCallback( float width, float height)
 {
 	g_OpenGLWidth = width;
@@ -180,6 +205,7 @@ void	MyComboBoxCallback(int comboId, const char* item)
 				gReset = true;
 				selectedDemo = i;
 				printf("selected demo %s!\n", item);
+				saveCurrentDemoEntry(i,sStartFileName);
 			}
 		}
 	}
@@ -611,6 +637,8 @@ int main(int argc, char* argv[])
 		return 0;
 	}

+	selectedDemo =  loadCurrentDemoEntry(sStartFileName);
+

 	args.GetCmdLineArgument("selected_demo",selectedDemo);

--- a/Demos3/GpuDemos/rigidbody/Bullet2FileDemo.cpp
+++ b/Demos3/GpuDemos/rigidbody/Bullet2FileDemo.cpp
@@ -17,8 +17,8 @@ void Bullet2FileDemo::setupScene(const ConstructionInfo& ci)
 {
 	b3Assert(ci.m_instancingRenderer);

-	const char* fileName="data/testFile.bullet";
-//const char* fileName="data/testFileFracture.bullet";
+	//const char* fileName="data/testFile.bullet";
+	const char* fileName="data/testFileFracture.bullet";

 	FILE* f = 0;

--- a/Demos3/GpuDemos/rigidbody/ConcaveScene.cpp
+++ b/Demos3/GpuDemos/rigidbody/ConcaveScene.cpp
@@ -233,7 +233,7 @@ void ConcaveScene::setupScene(const ConstructionInfo& ci)

 	//	b3Vector3 shift3(130,-150,-75);//0,230,80);//150,-100,-120);
 	//	createConcaveMesh(ci,"leoTest1.obj",shift3,scaling);
-		createConcaveMesh(ci,"samurai_monastry.obj",shift1,scaling);	
+		createConcaveMesh(ci,fileName,shift1,scaling);	
 				
 	} else
 	{
--- a/data/plane100.obj
+++ b/data/plane100.obj
@@ -8,5 +8,5 @@ v -100.000000 0.000000 100.000000
 v -100.000000 0.000000 -100.000000
 usemtl Material
 s off
-f 1 2 3
-f 1 3 4
+f 3 2 1
+f 4 3 1
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.cpp
@@ -16,8 +16,7 @@ subject to the following restrictions:
 bool findSeparatingAxisOnGpu = true;
 bool splitSearchSepAxisConcave = false;
 bool splitSearchSepAxisConvex = true;
-bool useMprGpu = false;//use mpr for edge-edge  (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling...
-bool useUnitDirections = false;
+bool useMprGpu = true;//use mpr for edge-edge  (+contact point) or sat. Needs testing on main OpenCL platforms, before enabling...
 bool bvhTraversalKernelGPU = true;
 bool findConcaveSeparatingAxisKernelGPU = true;
 bool clipConcaveFacesAndFindContactsCPU = false;//false;//true;
@@ -136,8 +135,7 @@ m_unitSphereDirections(m_context,m_queue)
 //		sprintf(flags,"-g -s \"%s\"","C:/develop/bullet3_experiments2/opencl/gpu_narrowphase/kernels/sat.cl");
 //#endif
 		m_mprPenetrationKernel  = 0;
-		m_findSeparatingAxisUnitSphereKernel = 0;
-		if (useMprGpu||useUnitDirections)
+		if (useMprGpu)
 		{
 			cl_program mprProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,mprSrc,&errNum,flags,BT_NARROWPHASE_MPR_PATH);
 			b3Assert(errNum==CL_SUCCESS);
@@ -146,9 +144,6 @@ m_unitSphereDirections(m_context,m_queue)
 			b3Assert(m_mprPenetrationKernel);
 			b3Assert(errNum==CL_SUCCESS);

-			m_findSeparatingAxisUnitSphereKernel =  b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,mprSrc, "findSeparatingAxisUnitSphereKernel",&errNum,mprProg );
-			b3Assert(m_findSeparatingAxisUnitSphereKernel);
-			b3Assert(errNum==CL_SUCCESS);

 			int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
 			m_unitSphereDirections.resize(numDirections);
@@ -290,8 +285,6 @@ GpuSatCollision::~GpuSatCollision()
 	if (m_mprPenetrationKernel)
 		clReleaseKernel(m_mprPenetrationKernel);

-	if (m_findSeparatingAxisUnitSphereKernel)
-		clReleaseKernel(m_findSeparatingAxisUnitSphereKernel);

 	if (m_findSeparatingAxisKernel)
 		clReleaseKernel(m_findSeparatingAxisKernel);
@@ -3169,7 +3162,7 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>*

 					}
 				}
-				// else
+				
 				if (1)
 				{

@@ -3200,35 +3193,8 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>*
 					}


-				
-					
-					if (useUnitDirections)
-					{
-						B3_PROFILE("findSeparatingAxisUnitSphereKernel");
-						b3BufferInfoCL bInfo[] = { 
-							b3BufferInfoCL( pairs->getBufferCL(), true ), 
-							b3BufferInfoCL( bodyBuf->getBufferCL(),true), 
-							b3BufferInfoCL( gpuCollidables.getBufferCL(),true), 
-							b3BufferInfoCL( convexData.getBufferCL(),true),
-							b3BufferInfoCL( gpuVertices.getBufferCL(),true),
-							b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true),
-							b3BufferInfoCL( m_sepNormals.getBufferCL()),
-							b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
-							b3BufferInfoCL( m_dmins.getBufferCL())
-						};
-
-						b3LauncherCL launcher(m_queue, m_findSeparatingAxisUnitSphereKernel,"findSeparatingAxisUnitSphereKernel");
-						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
 					int numDirections = sizeof(unitSphere162)/sizeof(b3Vector3);
-						launcher.setConst( numDirections);
 					
-						launcher.setConst( nPairs  );
-						
-						int num = nPairs;
-						launcher.launch1D( num);
-						clFinish(m_queue);
-
-					} else
 					{
 					B3_PROFILE("findSeparatingAxisEdgeEdgeKernel");
 					b3BufferInfoCL bInfo[] = { 
@@ -3243,11 +3209,14 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( b3OpenCLArray<b3Int4>*
 						b3BufferInfoCL( clAabbsWorldSpace.getBufferCL(),true),
 						b3BufferInfoCL( m_sepNormals.getBufferCL()),
 						b3BufferInfoCL( m_hasSeparatingNormals.getBufferCL()),
-						b3BufferInfoCL( m_dmins.getBufferCL())
+						b3BufferInfoCL( m_dmins.getBufferCL()),
+						b3BufferInfoCL( m_unitSphereDirections.getBufferCL(),true)
+
 					};

 					b3LauncherCL launcher(m_queue, m_findSeparatingAxisEdgeEdgeKernel,"findSeparatingAxisEdgeEdgeKernel");
 					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setConst( numDirections);
 					launcher.setConst( nPairs  );

 					int num = nPairs;
--- a/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/b3ConvexHullContact.h
@@ -27,7 +27,6 @@ struct GpuSatCollision
 	cl_command_queue		m_queue;
 	cl_kernel				m_findSeparatingAxisKernel;
 	cl_kernel				m_mprPenetrationKernel;
-	cl_kernel				m_findSeparatingAxisUnitSphereKernel;


 	cl_kernel m_findSeparatingAxisVertexFaceKernel;
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mpr.cl
@@ -68,6 +68,7 @@ __kernel void   mprPenetrationKernel( __global int4* pairs,
 			AppendInc( nGlobalContactsOut, dstIdx );
 			if (dstIdx<contactCapacity)
 			{
+				pairs[pairIndex].z = dstIdx;
 				__global struct b3Contact4Data* c = globalContactsOut + dstIdx;
 				c->m_worldNormalOnB = -dirOut;//normal;
 				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h
@@ -1223,6 +1223,7 @@ static const char* mprKernelsCL= \
 "			AppendInc( nGlobalContactsOut, dstIdx );\n"
 "			if (dstIdx<contactCapacity)\n"
 "			{\n"
+"				pairs[pairIndex].z = dstIdx;\n"
 "				__global struct b3Contact4Data* c = globalContactsOut + dstIdx;\n"
 "				c->m_worldNormalOnB = -dirOut;//normal;\n"
 "				c->m_restituitionCoeffCmp = (0.f*0xffff);c->m_frictionCoeffCmp = (0.7f*0xffff);\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/primitiveContacts.h
@@ -12,6 +12,11 @@ static const char* primitiveContactsKernelsCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -48,6 +53,29 @@ static const char* primitiveContactsKernelsCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "typedef  struct b3Contact4Data b3Contact4Data_t;\n"
 "struct b3Contact4Data\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/sat.cl
@@ -653,6 +653,69 @@ bool findSeparatingAxis(	__global const ConvexPolyhedronCL* hullA, __global cons



+bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
+	const float4 posA1,
+	const float4 ornA,
+	const float4 posB1,
+	const float4 ornB,
+	const float4 DeltaC2,
+	__global const float4* vertices,
+	__global const float4* unitSphereDirections,
+	int numUnitSphereDirections,
+	float4* sep,
+	float* dmin)
+{
+	
+	float4 posA = posA1;
+	posA.w = 0.f;
+	float4 posB = posB1;
+	posB.w = 0.f;
+
+	int curPlaneTests=0;
+
+	int curEdgeEdge = 0;
+	// Test unit sphere directions
+	for (int i=0;i<numUnitSphereDirections;i++)
+	{
+
+		float4 crossje;
+		crossje = unitSphereDirections[i];	
+
+		if (dot3F4(DeltaC2,crossje)>0)
+			crossje *= -1.f;
+		{
+			float dist;
+			bool result = true;
+			float Min0,Max0;
+			float Min1,Max1;
+			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);
+			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);
+		
+			if(Max0<Min1 || Max1<Min0)
+				return false;
+		
+			float d0 = Max0 - Min1;
+			float d1 = Max1 - Min0;
+			dist = d0<d1 ? d0:d1;
+			result = true;
+	
+			if(dist<*dmin)
+			{
+				*dmin = dist;
+				*sep = crossje;
+			}
+		}
+	}
+
+	
+	if((dot3F4(-DeltaC2,*sep))>0.0f)
+	{
+		*sep = -(*sep);
+	}
+	return true;
+}
+
+
 bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, 
 	const float4 posA1,
 	const float4 ornA,
@@ -1458,6 +1521,8 @@ __kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs,
 																					__global  float4* separatingNormals,
 																					__global  int* hasSeparatingAxis,
 																					__global  float* dmins,
+																					__global const float4* unitSphereDirections,
+																					int numUnitSphereDirections,
 																					int numPairs
 																					)
 {
@@ -1497,11 +1562,26 @@ __kernel void   findSeparatingAxisEdgeEdgeKernel( __global const int4* pairs,
 			const float4 DeltaC2 = c0 - c1;
 			float4 sepNormal = separatingNormals[i];
 			
-			bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+			
+			
+			bool sepEE = false;
+			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;
+			if (numEdgeEdgeDirections<numUnitSphereDirections)
+			{
+				sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
 																									posB,ornB,
 																									DeltaC2,
 																									vertices,uniqueEdges,faces,
 																									indices,&sepNormal,&dmin);
+			}
+			else
+			{
+				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,
+																									posB,ornB,
+																									DeltaC2,
+																									vertices,unitSphereDirections,numUnitSphereDirections,
+																									&sepNormal,&dmin);
+			}
 			if (!sepEE)
 			{
 				hasSeparatingAxis[i] = 0;
@@ -1918,6 +1998,8 @@ __kernel void   findConcaveSeparatingAxisKernel( __global int4* concavePairs,
 		//mark this pair as in-active
 		concavePairs[pairIdx].w = -1;
 	}
+	
+	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts
 }


--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.cl
@@ -1818,9 +1818,14 @@ __kernel void   newContactReductionKernel( __global int4* pairs,
                
                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);
            
-                int dstIdx;
-                AppendInc( nGlobalContactsOut, dstIdx );
+				int mprContactIndex = pairs[pairIndex].z;

+                int dstIdx = mprContactIndex;
+
+				if (dstIdx<0)
+				{
+	                AppendInc( nGlobalContactsOut, dstIdx );
+				}
 //#if 0
                
 				if (dstIdx < contactCapacity)
@@ -1849,6 +1854,7 @@ __kernel void   newContactReductionKernel( __global int4* pairs,
                        case 2:
                            c->m_worldPosB[1] = pointsIn[contactIdx.y];
                        case 1:
+							if (mprContactIndex<0)//test
 	                            c->m_worldPosB[0] = pointsIn[contactIdx.x];
                        default:
                        {
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satClipHullContacts.h
@@ -2036,9 +2036,12 @@ static const char* satClipKernelsCL= \
 "                \n"
 "                int nReducedContacts = extractManifoldSequentialGlobal(pointsIn, nPoints, normal, &contactIdx);\n"
 "            \n"
-"                int dstIdx;\n"
+"				int mprContactIndex = pairs[pairIndex].z;\n"
+"                int dstIdx = mprContactIndex;\n"
+"				if (dstIdx<0)\n"
+"				{\n"
 "	                AppendInc( nGlobalContactsOut, dstIdx );\n"
-"				\n"
+"				}\n"
 "//#if 0\n"
 "                \n"
 "				if (dstIdx < contactCapacity)\n"
@@ -2063,6 +2066,7 @@ static const char* satClipKernelsCL= \
 "                        case 2:\n"
 "                            c->m_worldPosB[1] = pointsIn[contactIdx.y];\n"
 "                        case 1:\n"
+"							if (mprContactIndex<0)//test\n"
 "	                            c->m_worldPosB[0] = pointsIn[contactIdx.x];\n"
 "                        default:\n"
 "                        {\n"
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcave.cl
@@ -1214,5 +1214,7 @@ __kernel void   findConcaveSeparatingAxisEdgeEdgeKernel( __global int4* concaveP
 		//mark this pair as in-active
 		concavePairs[pairIdx].w = -1;
 	}
+	
+	concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points
 }

--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h
@@ -153,6 +153,11 @@ static const char* satConcaveKernelsCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -189,6 +194,29 @@ static const char* satConcaveKernelsCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "#ifndef B3_MAT3x3_H\n"
 "#define B3_MAT3x3_H\n"
@@ -220,6 +248,7 @@ static const char* satConcaveKernelsCL= \
 "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
 "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
 "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
 "{\n"
 "	b3Quat ans;\n"
@@ -254,6 +283,10 @@ static const char* satConcaveKernelsCL= \
 "	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
 "{\n"
 "	return (b3Quat)(-q.xyz, q.w);\n"
@@ -1417,5 +1450,7 @@ static const char* satConcaveKernelsCL= \
 "		//mark this pair as in-active\n"
 "		concavePairs[pairIdx].w = -1;\n"
 "	}\n"
+"	\n"
+"	concavePairs[i].z = -1;//for the next stage, z is used to determine existing contact points\n"
 "}\n"
 ;
--- a/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
+++ b/src/Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h
@@ -888,6 +888,62 @@ static const char* satKernelsCL= \
 "	\n"
 "	return true;\n"
 "}\n"
+"bool findSeparatingAxisUnitSphere(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
+"	const float4 posA1,\n"
+"	const float4 ornA,\n"
+"	const float4 posB1,\n"
+"	const float4 ornB,\n"
+"	const float4 DeltaC2,\n"
+"	__global const float4* vertices,\n"
+"	__global const float4* unitSphereDirections,\n"
+"	int numUnitSphereDirections,\n"
+"	float4* sep,\n"
+"	float* dmin)\n"
+"{\n"
+"	\n"
+"	float4 posA = posA1;\n"
+"	posA.w = 0.f;\n"
+"	float4 posB = posB1;\n"
+"	posB.w = 0.f;\n"
+"	int curPlaneTests=0;\n"
+"	int curEdgeEdge = 0;\n"
+"	// Test unit sphere directions\n"
+"	for (int i=0;i<numUnitSphereDirections;i++)\n"
+"	{\n"
+"		float4 crossje;\n"
+"		crossje = unitSphereDirections[i];	\n"
+"		if (dot3F4(DeltaC2,crossje)>0)\n"
+"			crossje *= -1.f;\n"
+"		{\n"
+"			float dist;\n"
+"			bool result = true;\n"
+"			float Min0,Max0;\n"
+"			float Min1,Max1;\n"
+"			project(hullA,posA,ornA,&crossje,vertices, &Min0, &Max0);\n"
+"			project(hullB,posB,ornB,&crossje,vertices, &Min1, &Max1);\n"
+"		\n"
+"			if(Max0<Min1 || Max1<Min0)\n"
+"				return false;\n"
+"		\n"
+"			float d0 = Max0 - Min1;\n"
+"			float d1 = Max1 - Min0;\n"
+"			dist = d0<d1 ? d0:d1;\n"
+"			result = true;\n"
+"	\n"
+"			if(dist<*dmin)\n"
+"			{\n"
+"				*dmin = dist;\n"
+"				*sep = crossje;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"	\n"
+"	if((dot3F4(-DeltaC2,*sep))>0.0f)\n"
+"	{\n"
+"		*sep = -(*sep);\n"
+"	}\n"
+"	return true;\n"
+"}\n"
 "bool findSeparatingAxisEdgeEdge(	__global const ConvexPolyhedronCL* hullA, __global const ConvexPolyhedronCL* hullB, \n"
 "	const float4 posA1,\n"
 "	const float4 ornA,\n"
@@ -1601,6 +1657,8 @@ static const char* satKernelsCL= \
 "																					__global  float4* separatingNormals,\n"
 "																					__global  int* hasSeparatingAxis,\n"
 "																					__global  float* dmins,\n"
+"																					__global const float4* unitSphereDirections,\n"
+"																					int numUnitSphereDirections,\n"
 "																					int numPairs\n"
 "																					)\n"
 "{\n"
@@ -1638,11 +1696,26 @@ static const char* satKernelsCL= \
 "			const float4 DeltaC2 = c0 - c1;\n"
 "			float4 sepNormal = separatingNormals[i];\n"
 "			\n"
-"			bool sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"			\n"
+"			\n"
+"			bool sepEE = false;\n"
+"			int numEdgeEdgeDirections = convexShapes[shapeIndexA].m_numUniqueEdges*convexShapes[shapeIndexB].m_numUniqueEdges;\n"
+"			if (numEdgeEdgeDirections<numUnitSphereDirections)\n"
+"			{\n"
+"				sepEE = findSeparatingAxisEdgeEdge(	&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
 "																									posB,ornB,\n"
 "																									DeltaC2,\n"
 "																									vertices,uniqueEdges,faces,\n"
 "																									indices,&sepNormal,&dmin);\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				sepEE = findSeparatingAxisUnitSphere(&convexShapes[shapeIndexA], &convexShapes[shapeIndexB],posA,ornA,\n"
+"																									posB,ornB,\n"
+"																									DeltaC2,\n"
+"																									vertices,unitSphereDirections,numUnitSphereDirections,\n"
+"																									&sepNormal,&dmin);\n"
+"			}\n"
 "			if (!sepEE)\n"
 "			{\n"
 "				hasSeparatingAxis[i] = 0;\n"
@@ -2012,5 +2085,7 @@ static const char* satKernelsCL= \
 "		//mark this pair as in-active\n"
 "		concavePairs[pairIdx].w = -1;\n"
 "	}\n"
+"	\n"
+"	concavePairs[pairIdx].z = -1;//now z is used for existing/persistent contacts\n"
 "}\n"
 ;
--- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipeline.cpp
@@ -42,6 +42,7 @@ bool gDumpContactStats = false;
 bool gCalcWorldSpaceAabbOnCpu = false;
 bool gUseCalculateOverlappingPairsHost = false;
 bool gIntegrateOnCpu = false;
+bool gClearPairsOnGpu = true;

 #define TEST_OTHER_GPU_SOLVER 1
 #ifdef TEST_OTHER_GPU_SOLVER
@@ -106,6 +107,11 @@ b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id devic
 		b3Assert(errNum==CL_SUCCESS);
 		m_data->m_updateAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "initializeGpuAabbsFull",&errNum,prog);
 		b3Assert(errNum==CL_SUCCESS);
+
+
+		m_data->m_clearOverlappingPairsKernel = b3OpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,updateAabbsKernelCL, "clearOverlappingPairsKernel",&errNum,prog);
+		b3Assert(errNum==CL_SUCCESS);
+
 		clReleaseProgram(prog);
 	}

@@ -114,8 +120,14 @@ b3GpuRigidBodyPipeline::b3GpuRigidBodyPipeline(cl_context ctx,cl_device_id devic

 b3GpuRigidBodyPipeline::~b3GpuRigidBodyPipeline()
 {
+	if (m_data->m_integrateTransformsKernel)
 		clReleaseKernel(m_data->m_integrateTransformsKernel);
 	
+	if (m_data->m_updateAabbsKernel)
+		clReleaseKernel(m_data->m_updateAabbsKernel);
+	
+	if (m_data->m_clearOverlappingPairsKernel)
+		clReleaseKernel(m_data->m_clearOverlappingPairsKernel);
 	delete m_data->m_raycaster;
 	delete m_data->m_solver;
 	delete m_data->m_allAabbsGPU;
@@ -247,6 +259,7 @@ void	b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
 				m_data->m_broadphaseDbvt->calculateOverlappingPairs();
 			}
 			numPairs = m_data->m_broadphaseDbvt->getOverlappingPairCache()->getNumOverlappingPairs();
+
 		} else
 		{
 			if (gUseCalculateOverlappingPairsHost)
@@ -284,6 +297,43 @@ void	b3GpuRigidBodyPipeline::stepSimulation(float deltaTime)
 			aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
 		}

+		m_data->m_overlappingPairsGPU->resize(numPairs);
+
+		//mark the contacts for each pair as 'unused'
+		if (numPairs)
+		{
+			b3OpenCLArray<b3BroadphasePair> gpuPairs(this->m_data->m_context,m_data->m_queue);
+			gpuPairs.setFromOpenCLBuffer(pairs,numPairs);
+
+			if (gClearPairsOnGpu)
+			{
+				
+
+				//b3AlignedObjectArray<b3BroadphasePair> hostPairs;//just for debugging
+				//gpuPairs.copyToHost(hostPairs);
+
+				b3LauncherCL launcher(m_data->m_queue,m_data->m_clearOverlappingPairsKernel,"clearOverlappingPairsKernel");
+				launcher.setBuffer(pairs);
+				launcher.setConst(numPairs);
+				launcher.launch1D(numPairs);
+
+
+				//gpuPairs.copyToHost(hostPairs);
+			
+
+			} else
+			{
+				b3AlignedObjectArray<b3BroadphasePair> hostPairs;
+				gpuPairs.copyToHost(hostPairs);
+
+				for (int i=0;i<hostPairs.size();i++)
+				{
+					hostPairs[i].z = 0xffffffff;
+				}
+
+				gpuPairs.copyFromHost(hostPairs);
+			}
+		}

 		m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies);
 		numContacts = m_data->m_narrowphase->getNumContactsGpu();
--- a/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h
+++ b/src/Bullet3OpenCL/RigidBody/b3GpuRigidBodyPipelineInternalData.h
@@ -41,6 +41,7 @@ struct b3GpuRigidBodyPipelineInternalData

 	cl_kernel	m_integrateTransformsKernel;
 	cl_kernel	m_updateAabbsKernel;
+	cl_kernel	m_clearOverlappingPairsKernel;
 	
 	class b3PgsJacobiSolver* m_solver;
 	
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.h
@@ -24,6 +24,11 @@ static const char* batchingKernelsCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -60,6 +65,29 @@ static const char* batchingKernelsCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "typedef  struct b3Contact4Data b3Contact4Data_t;\n"
 "struct b3Contact4Data\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.h
@@ -24,6 +24,11 @@ static const char* batchingKernelsNewCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -60,6 +65,29 @@ static const char* batchingKernelsNewCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "typedef  struct b3Contact4Data b3Contact4Data_t;\n"
 "struct b3Contact4Data\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/integrateKernel.h
@@ -24,6 +24,11 @@ static const char* integrateKernelCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -60,6 +65,29 @@ static const char* integrateKernelCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "#ifndef B3_QUAT_H\n"
 "#define B3_QUAT_H\n"
@@ -89,6 +117,7 @@ static const char* integrateKernelCL= \
 "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
 "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
 "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
 "{\n"
 "	b3Quat ans;\n"
@@ -123,6 +152,10 @@ static const char* integrateKernelCL= \
 "	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
 "{\n"
 "	return (b3Quat)(-q.xyz, q.w);\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup.h
@@ -24,6 +24,11 @@ static const char* solverSetupCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -60,6 +65,29 @@ static const char* solverSetupCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "typedef  struct b3Contact4Data b3Contact4Data_t;\n"
 "struct b3Contact4Data\n"
@@ -151,6 +179,7 @@ static const char* solverSetupCL= \
 "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
 "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
 "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
 "{\n"
 "	b3Quat ans;\n"
@@ -185,6 +214,10 @@ static const char* solverSetupCL= \
 "	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
 "{\n"
 "	return (b3Quat)(-q.xyz, q.w);\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.h
@@ -24,6 +24,11 @@ static const char* solverSetup2CL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -60,6 +65,29 @@ static const char* solverSetup2CL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "typedef  struct b3Contact4Data b3Contact4Data_t;\n"
 "struct b3Contact4Data\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/solverUtils.h
@@ -24,6 +24,11 @@ static const char* solverUtilsCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -60,6 +65,29 @@ static const char* solverUtilsCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "typedef  struct b3Contact4Data b3Contact4Data_t;\n"
 "struct b3Contact4Data\n"
--- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl
+++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.cl
@@ -11,3 +11,12 @@ __kernel void initializeGpuAabbsFull(  const int numNodes, __global b3RigidBodyD
 		b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);
 	}
 }
+
+__kernel void clearOverlappingPairsKernel(  __global int4* pairs, int numPairs)
+{
+	int pairId = get_global_id(0);
+	if( pairId< numPairs )
+	{
+		pairs[pairId].z = 0xffffffff;
+	}
+}
--- a/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
+++ b/src/Bullet3OpenCL/RigidBody/kernels/updateAabbsKernel.h
@@ -14,6 +14,11 @@ static const char* updateAabbsKernelCL= \
 "};\n"
 "#ifdef __cplusplus\n"
 "#else\n"
+"//keep B3_LARGE_FLOAT*B3_LARGE_FLOAT < FLT_MAX\n"
+"#define B3_LARGE_FLOAT 1e18f\n"
+"#define B3_INFINITY 1e18f\n"
+"#define b3Assert(a)\n"
+"#define b3ConstArray(a) __global const a*\n"
 "#define b3AtomicInc atomic_inc\n"
 "#define b3AtomicAdd atomic_add\n"
 "#define b3Fabs fabs\n"
@@ -50,6 +55,29 @@ static const char* updateAabbsKernelCL= \
 "		return false;\n"
 "	return true;\n"
 "}\n"
+"inline int    b3MaxDot( b3Float4ConstArg vec, __global const b3Float4* vecArray, int vecLen, float* dotOut )\n"
+"{\n"
+"    float maxDot = -B3_INFINITY;\n"
+"    int i = 0;\n"
+"    int ptIndex = -1;\n"
+"    for( i = 0; i < vecLen; i++ )\n"
+"    {\n"
+"        float dot = b3Dot3F4(vecArray[i],vec);\n"
+"            \n"
+"        if( dot > maxDot )\n"
+"        {\n"
+"            maxDot = dot;\n"
+"            ptIndex = i;\n"
+"        }\n"
+"    }\n"
+"	b3Assert(ptIndex>=0);\n"
+"    if (ptIndex<0)\n"
+"	{\n"
+"		ptIndex = 0;\n"
+"	}\n"
+"    *dotOut = maxDot;\n"
+"    return ptIndex;\n"
+"}\n"
 "#endif //B3_FLOAT4_H\n"
 "#ifndef B3_MAT3x3_H\n"
 "#define B3_MAT3x3_H\n"
@@ -81,6 +109,7 @@ static const char* updateAabbsKernelCL= \
 "inline b3Quat b3QuatNormalized(b3QuatConstArg in);\n"
 "inline b3Quat b3QuatRotate(b3QuatConstArg q, b3QuatConstArg vec);\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q);\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q);\n"
 "inline b3Quat b3QuatMul(b3QuatConstArg a, b3QuatConstArg b)\n"
 "{\n"
 "	b3Quat ans;\n"
@@ -115,6 +144,10 @@ static const char* updateAabbsKernelCL= \
 "	float4 out = b3QuatMul(b3QuatMul(q,vcpy),qInv);\n"
 "	return out;\n"
 "}\n"
+"inline b3Quat b3QuatInverse(b3QuatConstArg q)\n"
+"{\n"
+"	return (b3Quat)(-q.xyz, q.w);\n"
+"}\n"
 "inline b3Quat b3QuatInvert(b3QuatConstArg q)\n"
 "{\n"
 "	return (b3Quat)(-q.xyz, q.w);\n"
@@ -422,4 +455,12 @@ static const char* updateAabbsKernelCL= \
 "		b3ComputeWorldAabb(nodeID, gBodies, collidables, plocalShapeAABB,pAABB);\n"
 "	}\n"
 "}\n"
+"__kernel void clearOverlappingPairsKernel(  __global int4* pairs, int numPairs)\n"
+"{\n"
+"	int pairId = get_global_id(0);\n"
+"	if( pairId< numPairs )\n"
+"	{\n"
+"		pairs[pairId].z = 0xffffffff;\n"
+"	}\n"
+"}\n"
 ;