diff --git a/Demos3/AllBullet2Demos/BulletDemoEntries.h b/Demos3/AllBullet2Demos/BulletDemoEntries.h
index 34df5728a..b2518b2e4 100644
--- a/Demos3/AllBullet2Demos/BulletDemoEntries.h
+++ b/Demos3/AllBullet2Demos/BulletDemoEntries.h
@@ -10,7 +10,7 @@
 
 #include "../bullet2/RagdollDemo/RagdollDemo.h"
 #include "../bullet2/LuaDemo/LuaDemo.h"
-
+#include "../bullet2/ChainDemo/ChainDemo.h"
 
 struct BulletDemoEntry
 {
@@ -24,6 +24,7 @@ static BulletDemoEntry allDemos[]=
 	
 	//{"emptydemo",EmptyBulletDemo::MyCreateFunc},
 	{"BasicDemo",BasicDemo::MyCreateFunc},
+	//{"ChainDemo",ChainDemo::MyCreateFunc},
 	{"HingeDemo",HingeDemo::MyCreateFunc},
 	{"Ragdoll",RagDollDemo::MyCreateFunc},
 	{"MultiBody1",FeatherstoneDemo1::MyCreateFunc},
diff --git a/Demos3/AllBullet2Demos/main.cpp b/Demos3/AllBullet2Demos/main.cpp
index d8d2907a4..0cf01f25e 100644
--- a/Demos3/AllBullet2Demos/main.cpp
+++ b/Demos3/AllBullet2Demos/main.cpp
@@ -14,6 +14,8 @@ static int sCurrentDemoIndex = 0;
 static BulletDemoInterface* sCurrentDemo = 0;
 static b3AlignedObjectArray<const char*> allNames;
 
+
+bool drawGUI=true;
 extern bool useShadowMap;
 static bool wireframe=false;
 static bool pauseSimulation=false;
@@ -128,8 +130,10 @@ void	MyComboBoxCallback(int comboId, const char* item)
 	
 }
 
+extern bool sOpenGLVerbose;
 int main(int argc, char* argv[])
 {
+	sOpenGLVerbose = false;
 	
 	float dt = 1./120.f;
 	int width = 1024;
@@ -175,12 +179,17 @@ int main(int argc, char* argv[])
 		app->m_instancingRenderer->updateCamera();
 		
 		app->drawGrid();
+		
+		if (0)
+		{
 		char bla[1024];
 		static int frameCount = 0;
 		frameCount++;
 		sprintf(bla,"Simple test frame %d", frameCount);
 		
 		app->drawText(bla,10,10);
+		}
+
 		if (sCurrentDemo)
 		{
 			if (!pauseSimulation)
diff --git a/Demos3/AllBullet2Demos/premake4.lua b/Demos3/AllBullet2Demos/premake4.lua
index dd0cd5a2d..508e4df8c 100644
--- a/Demos3/AllBullet2Demos/premake4.lua
+++ b/Demos3/AllBullet2Demos/premake4.lua
@@ -31,6 +31,9 @@
 		"../bullet2/BasicDemo/BasicDemo.h",
 		"../bullet2/BasicDemo/HingeDemo.cpp",
 		"../bullet2/BasicDemo/HingeDemo.h",
+		"../bullet2/ChainDemo/ChainDemo.cpp", -- ChainDemo sources live in their own ChainDemo folder, not in BasicDemo
+		"../bullet2/ChainDemo/ChainDemo.h",
+		
 		"../bullet2/RagdollDemo/RagdollDemo.cpp",
 		"../bullet2/RagdollDemo/RagdollDemo.h",
 		"../bullet2/LuaDemo/LuaDemo.cpp",
diff --git a/Demos3/GpuDemos/broadphase/PairBench.cpp b/Demos3/GpuDemos/broadphase/PairBench.cpp
index 3a908a7eb..9c445417c 100644
--- a/Demos3/GpuDemos/broadphase/PairBench.cpp
+++ b/Demos3/GpuDemos/broadphase/PairBench.cpp
@@ -112,8 +112,7 @@ static BroadphaseEntry allBroadphases[]=
 	{"GPU Brute Force",b3GpuSapBroadphase::CreateFuncBruteForceGpu},
 	{"GPU 1-SAP Original",b3GpuSapBroadphase::CreateFuncOriginal},
 	{"GPU 1-SAP Barrier",b3GpuSapBroadphase::CreateFuncBarrier},
-	{"GPU 1-SAP LDS",b3GpuSapBroadphase::CreateFuncLocalMemory},
-	{"GPU 1-SAP LDS Batch",b3GpuSapBroadphase::CreateFuncLocalMemoryBatchWrite},
+	{"GPU 1-SAP LDS",b3GpuSapBroadphase::CreateFuncLocalMemory}
 };
 
 
diff --git a/Demos3/bullet2/ChainDemo/ChainDemo.cpp b/Demos3/bullet2/ChainDemo/ChainDemo.cpp
new file mode 100644
index 000000000..4c353a01b
--- /dev/null
+++ b/Demos3/bullet2/ChainDemo/ChainDemo.cpp
@@ -0,0 +1,201 @@
+#include "ChainDemo.h"
+#include "OpenGLWindow/SimpleOpenGL3App.h"
+#include "btBulletDynamicsCommon.h"
+#include "LinearMath/btVector3.h"
+
+#include "BulletDynamics/ConstraintSolver/btNNCGConstraintSolver.h"
+#include "BulletDynamics/MLCPSolvers/btDantzigSolver.h"
+#include "BulletDynamics/MLCPSolvers/btLemkeSolver.h"
+#include "BulletDynamics/MLCPSolvers/btSolveProjectedGaussSeidel.h"
+#include "BulletDynamics/MLCPSolvers/btMLCPSolver.h"
+
+
+#define NUM_SPHERES 10
+
+static const float scaling=0.35f;
+
+ChainDemo::ChainDemo(SimpleOpenGL3App* app) // only forwards app to the base class; the dynamics world is created in initPhysics()
+:Bullet2RigidBodyDemo(app)
+{
+}
+
+ChainDemo::~ChainDemo() // empty body; NOTE(review): presumably cleanup is expected to happen through exitPhysics() - confirm against Bullet2RigidBodyDemo
+{
+}
+
+void	ChainDemo::createGround(int cubeShapeId) // adds a ground box: one render instance of shape cubeShapeId plus a matching zero-mass rigid body
+{
+	{
+		float color[]={0.3,0.3,1,1};
+		float halfExtents[]={50,1,50,1}; // passed to the renderer as-is; the first three values are reused as the btBoxShape half extents
+		btTransform groundTransform;
+		groundTransform.setIdentity();
+		groundTransform.setOrigin(btVector3(0,-5,0));
+		m_glApp->m_instancingRenderer->registerGraphicsInstance(cubeShapeId,groundTransform.getOrigin(),groundTransform.getRotation(),color,halfExtents);
+		btBoxShape* groundShape = new btBoxShape(btVector3(btScalar(halfExtents[0]),btScalar(halfExtents[1]),btScalar(halfExtents[2])));
+		//We can also use DemoApplication::localCreateRigidBody, but for clarity it is provided here:
+		{
+			btScalar mass(0.); // zero mass, so isDynamic below is false and no inertia is computed
+			//rigidbody is dynamic if and only if mass is non zero, otherwise static
+			bool isDynamic = (mass != 0.f);
+			btVector3 localInertia(0,0,0);
+			if (isDynamic)
+				groundShape->calculateLocalInertia(mass,localInertia);
+			//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+			btDefaultMotionState* myMotionState = new btDefaultMotionState(groundTransform);
+			btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,groundShape,localInertia);
+			btRigidBody* body = new btRigidBody(rbInfo);
+			//add the body to the dynamics world
+			m_dynamicsWorld->addRigidBody(body);
+			body->setActivationState(DISABLE_DEACTIVATION);
+		}
+	}
+}
+void	ChainDemo::initPhysics() // builds the dynamics world and a vertical chain of NUM_SPHERES spheres joined by point-to-point constraints
+{
+//	Bullet2RigidBodyDemo::initPhysics();
+
+	m_config = new btDefaultCollisionConfiguration;
+	m_dispatcher = new btCollisionDispatcher(m_config);
+	m_bp = new btDbvtBroadphase();
+	//m_solver = new btNNCGConstraintSolver();
+	m_solver = new btSequentialImpulseConstraintSolver();
+//	btDantzigSolver* mlcp = new btDantzigSolver();
+	//btLemkeSolver* mlcp = new btLemkeSolver();
+	//m_solver = new btMLCPSolver(mlcp);
+//	m_solver = new btSequentialImpulseConstraintSolver();
+	//btMultiBodyConstraintSolver* solver = new btMultiBodyConstraintSolver();
+	//m_solver = solver;
+
+	m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_bp,m_solver,m_config);
+	m_dynamicsWorld->getSolverInfo().m_numIterations = 1000; // NOTE(review): very high iteration count; presumably needed for the large mass ratio in the chain - confirm
+	m_dynamicsWorld->getSolverInfo().m_splitImpulse = false;
+
+	int curColor=0;
+	//create ground
+	btScalar radius=scaling;
+	int unitCubeShapeId = m_glApp->registerCubeShape(); // only referenced by the commented-out createGround call below
+	
+	float pos[]={0,0,0}; // unused in this function
+	float orn[]={0,0,0,1}; // unused in this function
+		
+
+	//createGround(unitCubeShapeId);
+	
+	int sphereShapeId = m_glApp->registerGraphicsSphereShape(radius,false);
+
+	{
+		float halfExtents[]={scaling,scaling,scaling,1};
+		btVector4 colors[4] =
+		{
+			btVector4(1,0,0,1),
+			btVector4(0,1,0,1),
+			btVector4(0,1,1,1),
+			btVector4(1,1,0,1),
+		};
+		
+
+
+		btTransform startTransform;
+		startTransform.setIdentity();
+		
+		
+		btCollisionShape* colShape = new btSphereShape(scaling); // one collision shape shared by every sphere in the chain
+
+		btScalar largeMass[]={1000,10,100,1000}; // only largeMass[0] is read while the outer loop runs a single iteration
+		for (int i=0;i<1;i++)
+		{
+
+			btAlignedObjectArray<btRigidBody*> bodies;
+			for (int k=0;k<NUM_SPHERES;k++)
+			{
+				btVector3 localInertia(0,0,0);
+				btScalar mass = 0.f;
+				curColor = 1;
+
+				switch (k)
+				{
+					case 0: // first sphere (lowest start height): the heavy one
+						{
+							mass = largeMass[i];
+							curColor = 0;
+							break;
+						}
+					case NUM_SPHERES-1: // last sphere (highest start height): zero mass, so the inertia computation below is skipped
+					{
+						mass = 0.f;
+						curColor = 2;
+						break;
+					}
+					default:
+						{
+							curColor = 1;
+							mass = 1.f;
+						}
+				};
+		
+				if (mass)
+					colShape ->calculateLocalInertia(mass,localInertia);
+
+				btVector4 color = colors[curColor];
+			
+				startTransform.setOrigin(btVector3(
+									btScalar(7.5+-i*5),
+									btScalar(6.*scaling+2.0*scaling*k), // spheres are spaced one diameter (2*scaling) apart along y
+									btScalar(0)));
+
+				m_glApp->m_instancingRenderer->registerGraphicsInstance(sphereShapeId,startTransform.getOrigin(),startTransform.getRotation(),color,halfExtents);
+			
+				//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+				btDefaultMotionState* myMotionState = new btDefaultMotionState(startTransform);
+				btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,colShape,localInertia);
+				btRigidBody* body = new btRigidBody(rbInfo);
+				bodies.push_back(body);
+				body->setActivationState(DISABLE_DEACTIVATION);
+
+				m_dynamicsWorld->addRigidBody(body);
+			}
+
+			//add constraints
+			btVector3 pivotInA(0,radius,0); // local point one radius above the centre of body k
+			btVector3 pivotInB(0,-radius,0); // local point one radius below the centre of body k+1
+			for (int k=0;k<NUM_SPHERES-1;k++)
+			{
+				btPoint2PointConstraint* p2p = new btPoint2PointConstraint(*bodies[k],*bodies[k+1],pivotInA,pivotInB);
+				m_dynamicsWorld->addConstraint(p2p,true); // true: disable collisions between the two linked bodies
+			}
+		}
+	}
+
+	m_glApp->m_instancingRenderer->writeTransforms();
+}
+void	ChainDemo::exitPhysics() // no ChainDemo-specific teardown here; delegates to the base class
+{
+	
+	Bullet2RigidBodyDemo::exitPhysics();
+}
+void	ChainDemo::renderScene() // copies every collision object's world transform into the instancing renderer, then draws
+{
+	//sync physics world transforms -> graphics instances
+	{
+		for (int i=0;i<m_dynamicsWorld->getNumCollisionObjects();i++)
+		{
+			btVector3 pos = m_dynamicsWorld->getCollisionObjectArray()[i]->getWorldTransform().getOrigin();
+			btQuaternion orn = m_dynamicsWorld->getCollisionObjectArray()[i]->getWorldTransform().getRotation();
+			m_glApp->m_instancingRenderer->writeSingleInstanceTransformToCPU(pos,orn,i); // relies on render instances having been registered in the same order as bodies were added to the world
+		}
+		m_glApp->m_instancingRenderer->writeTransforms();
+	}
+
+	m_glApp->m_instancingRenderer->renderScene();
+}
+
+	
+void	ChainDemo::stepSimulation(float dt) // advances the world by dt using at most 10 substeps with a fixed 1/240 s time step
+{
+	m_dynamicsWorld->stepSimulation(dt,10,1./240.);
+	//m_dynamicsWorld->stepSimulation(dt,10,1./60.);
+}
+
+
+
diff --git a/Demos3/bullet2/ChainDemo/ChainDemo.h b/Demos3/bullet2/ChainDemo/ChainDemo.h
new file mode 100644
index 000000000..09f0b4707
--- /dev/null
+++ b/Demos3/bullet2/ChainDemo/ChainDemo.h
@@ -0,0 +1,31 @@
+#ifndef CHAIN_DEMO_H
+#define CHAIN_DEMO_H
+
+#include "LinearMath/btVector3.h"
+#include "../BasicDemo/Bullet2RigidBodyDemo.h"
+
+
+
+class ChainDemo : public Bullet2RigidBodyDemo // demo: a chain of spheres linked by point-to-point constraints (see ChainDemo.cpp)
+{
+
+public:
+
+	static BulletDemoInterface* MyCreateFunc(SimpleOpenGL3App* app) // factory: returns a heap-allocated ChainDemo as a BulletDemoInterface*
+	{
+		return new ChainDemo(app);
+	}
+
+	ChainDemo(SimpleOpenGL3App* app);
+	virtual ~ChainDemo();
+	
+	void	createGround(int cubeShapeId); // adds a ground box; its only call in ChainDemo.cpp is currently commented out
+
+	virtual void	initPhysics(); // creates the dynamics world, the spheres and their constraints
+	virtual void	exitPhysics(); // forwards to Bullet2RigidBodyDemo::exitPhysics()
+	virtual void	renderScene(); // copies physics transforms to the renderer and draws
+	virtual void	stepSimulation(float dt); // steps the dynamics world by dt
+};
+
+
+#endif //CHAIN_DEMO_H
diff --git a/Demos3/bullet2/FeatherstoneMultiBodyDemo/BulletMultiBodyDemos.cpp b/Demos3/bullet2/FeatherstoneMultiBodyDemo/BulletMultiBodyDemos.cpp
index bce4ad80a..8158787f3 100644
--- a/Demos3/bullet2/FeatherstoneMultiBodyDemo/BulletMultiBodyDemos.cpp
+++ b/Demos3/bullet2/FeatherstoneMultiBodyDemo/BulletMultiBodyDemos.cpp
@@ -29,6 +29,7 @@ static float friction = 1.;
 #include "BulletCollision/CollisionShapes/btShapeHull.h"
 
 #define CONSTRAINT_DEBUG_SIZE 0.2f
+static bool prevCanSleep = false;
 
 struct GraphicsVertex
 {
@@ -243,6 +244,8 @@ bool	Bullet2MultiBodyDemo::mouseButtonCallback(int button, int state, float x, f
 					btMultiBodyLinkCollider* multiCol = (btMultiBodyLinkCollider*)btMultiBodyLinkCollider::upcast(rayCallback.m_collisionObject);
 					if (multiCol && multiCol->m_multiBody)
 					{
+						
+						prevCanSleep = multiCol->m_multiBody->getCanSleep();
 						multiCol->m_multiBody->setCanSleep(false);
 
 						btVector3 pivotInA = multiCol->m_multiBody->worldPosToLocal(multiCol->m_link, pickPos);
@@ -285,7 +288,7 @@ bool	Bullet2MultiBodyDemo::mouseButtonCallback(int button, int state, float x, f
 
 			if (m_pickingMultiBodyPoint2Point)
 			{
-				m_pickingMultiBodyPoint2Point->getMultiBodyA()->setCanSleep(true);
+				m_pickingMultiBodyPoint2Point->getMultiBodyA()->setCanSleep(prevCanSleep);
 				btMultiBodyDynamicsWorld* world = (btMultiBodyDynamicsWorld*) m_dynamicsWorld;
 				world->removeMultiBodyConstraint(m_pickingMultiBodyPoint2Point);
 				delete m_pickingMultiBodyPoint2Point;
diff --git a/btgui/OpenGLWindow/Win32OpenGLWindow.cpp b/btgui/OpenGLWindow/Win32OpenGLWindow.cpp
index 2a3928699..846c79a87 100644
--- a/btgui/OpenGLWindow/Win32OpenGLWindow.cpp
+++ b/btgui/OpenGLWindow/Win32OpenGLWindow.cpp
@@ -28,7 +28,7 @@ static void printGLString(const char *name, GLenum s) {
   printf("%s = %s\n",name, v);
 }
 
-
+bool sOpenGLVerbose = true;
 
 void Win32OpenGLWindow::enableOpenGL()
 {
@@ -56,9 +56,12 @@ void Win32OpenGLWindow::enableOpenGL()
 	m_data->m_hRC = wglCreateContext( m_data->m_hDC );
 	wglMakeCurrent( m_data->m_hDC, m_data->m_hRC );
 
-	 printGLString("Version", GL_VERSION);
-    printGLString("Vendor", GL_VENDOR);
-    printGLString("Renderer", GL_RENDERER);
+	if (sOpenGLVerbose)
+	{
+		 printGLString("Version", GL_VERSION);
+		printGLString("Vendor", GL_VENDOR);
+		printGLString("Renderer", GL_RENDERER);
+	}
     //printGLString("Extensions", GL_EXTENSIONS);
 
 }
diff --git a/data/init_physics.lua b/data/init_physics.lua
index eec83f143..12ff73f07 100644
--- a/data/init_physics.lua
+++ b/data/init_physics.lua
@@ -21,7 +21,7 @@ maxy = 10
 toggle=1
 
 for x=0,10 do
-	for y=0,10 do
+	for y=0,5 do
 		if toggle==1 then
 			toggle = 0
 			for z=0,10 do
@@ -32,8 +32,8 @@ for x=0,10 do
 				pos = {-14+x*2,2+2*y,z*2}
 			
 				body = createRigidBody(world,shape,mass,pos,orn)
-				setBodyPosition(world,body,pos)
-				setBodyOrientation(world,body,orn)
+				--setBodyPosition(world,body,pos)
+				--setBodyOrientation(world,body,orn)
 			end
 		else
 			toggle = 1
@@ -46,7 +46,7 @@ toggle=1
 shape = createSphereShape(world, 1)
 
 for x=0,10 do
-	for y=0,20 do
+	for y=0,5 do
 		if toggle==1 then
 			toggle = 0
 		else
@@ -62,8 +62,6 @@ for x=0,10 do
 				pos = {-14+x*2,2+2*y,z*2}
 			
 				body = createRigidBody(world,shape,mass,pos,orn)
-				setBodyPosition(world,body,pos)
-				setBodyOrientation(world,body,orn)
 			end
 		end
 	end
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
index 00600ee08..529951af0 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
@@ -9,11 +9,11 @@ bool searchIncremental3dSapOnGpu = true;
 
 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "kernels/sapKernels.h"
-#include "kernels/sapFastKernels.h"
+
 #include "Bullet3Common/b3MinMax.h"
 
 #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
-#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
+
 
 b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q , b3GpuSapKernelType kernelType)
 :m_context(ctx),
@@ -48,7 +48,7 @@ m_addedCountGPU(ctx,q),
 m_removedCountGPU(ctx,q)
 {
 	const char* sapSrc = sapCL;
-    const char* sapFastSrc = sapFastCL;
+    
     
 	cl_int errNum=0;
 
@@ -56,8 +56,8 @@ m_removedCountGPU(ctx,q)
 	b3Assert(m_device);
 	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
 	b3Assert(errNum==CL_SUCCESS);
-	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
-	//cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,0,&errNum,"",B3_BROADPHASE_SAPFAST_PATH,true);
+
+
 	b3Assert(errNum==CL_SUCCESS);
 #ifndef __APPLE__
 	m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue);
@@ -95,11 +95,6 @@ m_removedCountGPU(ctx,q)
 			break;
 		}
 
-		case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE:
-		{
-			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
-			break;
-		}
 		default:
 		{
 			m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
@@ -115,24 +110,7 @@ m_removedCountGPU(ctx,q)
 	m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg );
 	b3Assert(errNum==CL_SUCCESS);
 
-	m_computePairsIncremental3dSapKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsIncremental3dSapKernel",&errNum,sapFastProg );
-	b3Assert(errNum==CL_SUCCESS);
 		
-	/*
-#if 0
-
-	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
-	b3Assert(errNum==CL_SUCCESS);
-#else
-#ifndef __APPLE__
-	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
-	b3Assert(errNum==CL_SUCCESS);
-#else
-	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
-	b3Assert(errNum==CL_SUCCESS);
-#endif
-#endif
-	*/
 	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
 
 	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
@@ -153,7 +131,7 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase()
 	clReleaseKernel(m_sapKernel);
 	clReleaseKernel(m_sap2Kernel);
 	clReleaseKernel(m_prepareSumVarianceKernel);
-	clReleaseKernel(m_computePairsIncremental3dSapKernel);
+	
 
 }
 
@@ -469,7 +447,7 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
 	int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
 	b3Assert(a==b);
 	b3Assert(b==c);
-	
+	/*
 	if (searchIncremental3dSapOnGpu)
 	{
 		B3_PROFILE("computePairsIncremental3dSapKernelGPU");
@@ -547,6 +525,7 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
 
 	} 
 	else
+	*/
 	{
 		int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
 
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
index be014bb44..2d3d39367 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
@@ -24,7 +24,7 @@ class b3GpuSapBroadphase : public b3GpuBroadphaseInterface
 	cl_kernel				m_sapKernel;
 	cl_kernel				m_sap2Kernel;
 	cl_kernel				m_prepareSumVarianceKernel;
-	cl_kernel				m_computePairsIncremental3dSapKernel;
+	
 
 	class b3RadixSort32CL* m_sorter;
 
@@ -95,8 +95,7 @@ public:
 		B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU,
 		B3_GPU_SAP_KERNEL_ORIGINAL,
 		B3_GPU_SAP_KERNEL_BARRIER,
-		B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY,
-		B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE
+		B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY
 	};
 
 	b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q , b3GpuSapKernelType kernelType=B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
@@ -124,10 +123,7 @@ public:
 	{
 		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY);
 	}
-	static b3GpuBroadphaseInterface* CreateFuncLocalMemoryBatchWrite(cl_context ctx,cl_device_id device, cl_command_queue  q)
-	{
-		return new b3GpuSapBroadphase(ctx,device,q,B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY_BATCH_WRITE);
-	}
+	
 
 	virtual void  calculateOverlappingPairs(int maxPairs);
 	virtual void  calculateOverlappingPairsHost(int maxPairs);
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
deleted file mode 100644
index 21244b1a9..000000000
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
+++ /dev/null
@@ -1,453 +0,0 @@
-/*
-Copyright (c) 2012 Advanced Micro Devices, Inc.  
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the use of this software.
-Permission is granted to anyone to use this software for any purpose, 
-including commercial applications, and to alter it and redistribute it freely, 
-subject to the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
-2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
-3. This notice may not be removed or altered from any source distribution.
-*/
-//Originally written by Erwin Coumans
-
-#define NEW_PAIR_MARKER -1
-#define REMOVED_PAIR_MARKER -2
-
-typedef struct 
-{
-	union
-	{
-		float4	m_min;
-		float   m_minElems[4];
-		int			m_minIndices[4];
-	};
-	union
-	{
-		float4	m_max;
-		float   m_maxElems[4];
-		int			m_maxIndices[4];
-	};
-} btAabbCL;
-
-typedef struct 
-{
-	union
-	{
-		unsigned int m_key;
-		unsigned int x;
-	};
-
-	union
-	{
-		unsigned int m_value;
-		unsigned int y;
-		
-	};
-}b3SortData;
-
-
-/// conservative test for overlap between two aabbs
-bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
-bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
-{
-//skip pairs between static (mass=0) objects
-	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
-		return false;
-		
-	bool overlap = true;
-	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
-	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
-	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
-	return overlap;
-}
-
-__kernel void   computePairsIncremental3dSapKernel( __global const uint2* objectMinMaxIndexGPUaxis0,
-													__global const uint2* objectMinMaxIndexGPUaxis1,
-													__global const uint2* objectMinMaxIndexGPUaxis2,
-													__global const uint2* objectMinMaxIndexGPUaxis0prev,
-													__global const uint2* objectMinMaxIndexGPUaxis1prev,
-													__global const uint2* objectMinMaxIndexGPUaxis2prev,
-													__global const b3SortData*	   sortedAxisGPU0,
-													__global const b3SortData*	   sortedAxisGPU1,
-													__global const b3SortData*	   sortedAxisGPU2,
-													__global const b3SortData*	   sortedAxisGPU0prev,
-													__global const b3SortData*	   sortedAxisGPU1prev,
-													__global const b3SortData*	   sortedAxisGPU2prev,
-													__global int4*			addedHostPairsGPU,
-													__global int4*			removedHostPairsGPU,
-													volatile __global int*				addedHostPairsCount,
-													volatile __global int*				removedHostPairsCount,
-													int maxCapacity,
-													int numObjects)
-{
-	int i = get_global_id(0);
-	if (i>=numObjects)
-		return;
-
-	__global const uint2* objectMinMaxIndexGPU[3][2];
-	objectMinMaxIndexGPU[0][0]=objectMinMaxIndexGPUaxis0;
-	objectMinMaxIndexGPU[1][0]=objectMinMaxIndexGPUaxis1;
-	objectMinMaxIndexGPU[2][0]=objectMinMaxIndexGPUaxis2;
-	objectMinMaxIndexGPU[0][1]=objectMinMaxIndexGPUaxis0prev;
-	objectMinMaxIndexGPU[1][1]=objectMinMaxIndexGPUaxis1prev;
-	objectMinMaxIndexGPU[2][1]=objectMinMaxIndexGPUaxis2prev;
-
-	__global const b3SortData* sortedAxisGPU[3][2];
-	sortedAxisGPU[0][0] = sortedAxisGPU0;
-	sortedAxisGPU[1][0] = sortedAxisGPU1;
-	sortedAxisGPU[2][0] = sortedAxisGPU2;
-	sortedAxisGPU[0][1] = sortedAxisGPU0prev;
-	sortedAxisGPU[1][1] = sortedAxisGPU1prev;
-	sortedAxisGPU[2][1] = sortedAxisGPU2prev;
-
-	int m_currentBuffer = 0;
-
-	for (int axis=0;axis<3;axis++)
-	{
-		//int i = checkObjects[a];
-
-		unsigned int curMinIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].x;
-		unsigned int curMaxIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].y;
-		unsigned int prevMinIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].x;
-		int dmin = curMinIndex - prevMinIndex;
-				
-		unsigned int prevMaxIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].y;
-
-		int dmax = curMaxIndex - prevMaxIndex;
-	
-		for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)
-		{
-			if (dmin!=0)
-			{
-				int stepMin = dmin<0 ? -1 : 1;
-				for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)
-				{
-					int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;
-					int otherIndex = otherIndex2/2;
-					if (otherIndex!=i)
-					{
-						bool otherIsMax = ((otherIndex2&1)!=0);
-
-						if (otherIsMax)
-						{
-									
-							bool overlap = true;
-
-							for (int ax=0;ax<3;ax++)
-							{
-								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||
-									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))
-									overlap=false;
-							}
-
-						//	b3Assert(overlap2==overlap);
-
-							bool prevOverlap = true;
-
-							for (int ax=0;ax<3;ax++)
-							{
-								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||
-									(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))
-									prevOverlap=false;
-							}
-									
-
-							//b3Assert(overlap==overlap2);
-								
-
-
-							if (dmin<0)
-							{
-								if (overlap && !prevOverlap)
-								{
-									//add a pair
-									int4 newPair;
-									if (i<=otherIndex)
-									{
-										newPair.x = i;
-										newPair.y = otherIndex;
-									} else
-									{
-										newPair.x = otherIndex;
-										newPair.y = i;
-									}
-									
-									{
-										int curPair = atomic_inc(addedHostPairsCount);
-										if (curPair<maxCapacity)
-										{
-											addedHostPairsGPU[curPair].x = newPair.x;
-											addedHostPairsGPU[curPair].y = newPair.y;
-											addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;
-											addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;
-
-										}
-									}
-
-								}
-							} 
-							else
-							{
-								if (!overlap && prevOverlap)
-								{
-									
-									//remove a pair
-									int4 removedPair;
-									if (i<=otherIndex)
-									{
-										removedPair.x = i;
-										removedPair.y = otherIndex;
-									} else
-									{
-										removedPair.x = otherIndex;
-										removedPair.y = i;
-									}
-									{
-										int curPair = atomic_inc(removedHostPairsCount);
-										if (curPair<maxCapacity)
-										{
-											
-											removedHostPairsGPU[curPair].x = removedPair.x;
-											removedHostPairsGPU[curPair].y = removedPair.y;
-											removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;
-											removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;
-
-										}
-									}
-								}
-							}//otherisMax
-						}//if (dmin<0)
-					}//if (otherIndex!=i)
-				}//for (int j=
-			}
-				
-			if (dmax!=0)
-			{
-				int stepMax = dmax<0 ? -1 : 1;
-				for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)
-				{
-					int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;
-					int otherIndex = otherIndex2/2;
-					if (otherIndex!=i)
-					{
-						bool otherIsMin = ((otherIndex2&1)==0);
-						if (otherIsMin)
-						{
-									
-							bool overlap = true;
-
-							for (int ax=0;ax<3;ax++)
-							{
-								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||
-									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))
-									overlap=false;
-							}
-							//b3Assert(overlap2==overlap);
-
-							bool prevOverlap = true;
-
-							for (int ax=0;ax<3;ax++)
-							{
-								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||
-									(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))
-									prevOverlap=false;
-							}
-									
-
-							if (dmax>0)
-							{
-								if (overlap && !prevOverlap)
-								{
-									//add a pair
-									int4 newPair;
-									if (i<=otherIndex)
-									{
-										newPair.x = i;
-										newPair.y = otherIndex;
-									} else
-									{
-										newPair.x = otherIndex;
-										newPair.y = i;
-									}
-									{
-										int curPair = atomic_inc(addedHostPairsCount);
-										if (curPair<maxCapacity)
-										{
-											
-											addedHostPairsGPU[curPair].x = newPair.x;
-											addedHostPairsGPU[curPair].y = newPair.y;
-											addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;
-											addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;
-
-										}
-									}
-							
-								}
-							} 
-							else
-							{
-								if (!overlap && prevOverlap)
-								{
-									//if (otherIndex2&1==0) -> min?
-									//remove a pair
-									int4 removedPair;
-									if (i<=otherIndex)
-									{
-										removedPair.x = i;
-										removedPair.y = otherIndex;
-									} else
-									{
-										removedPair.x = otherIndex;
-										removedPair.y = i;
-									}
-									{
-										int curPair = atomic_inc(removedHostPairsCount);
-										if (curPair<maxCapacity)
-										{
-											
-											removedHostPairsGPU[curPair].x = removedPair.x;
-											removedHostPairsGPU[curPair].y = removedPair.y;
-											removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;
-											removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;
-										}
-									}
-								
-								}
-							}
-							
-						}//if (dmin<0)
-					}//if (otherIndex!=i)
-				}//for (int j=
-			}
-		}//for (int otherbuffer
-	}//for (int axis=0;
-
-
-}
-
-
-__kernel void   computePairsKernelLocalSharedMemoryBatchWrite( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
-{
-	int i = get_global_id(0);
-	int localId = get_local_id(0);
-
-	__local int numActiveWgItems[1];
-	__local int breakRequest[1];
-	__local btAabbCL localAabbs[128];// = aabbs[i];
-	
-	int2 myPairs[64];
-	
-	btAabbCL myAabb;
-	
-	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
-	float testValue = 	myAabb.m_maxElems[axis];
-	
-	if (localId==0)
-	{
-		numActiveWgItems[0] = 0;
-		breakRequest[0] = 0;
-	}
-	int localCount=0;
-	int block=0;
-	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
-	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
-	
-	barrier(CLK_LOCAL_MEM_FENCE);
-	atomic_inc(numActiveWgItems);
-	barrier(CLK_LOCAL_MEM_FENCE);
-	int localBreak = 0;
-	int curNumPairs = 0;
-	
-	int j=i+1;
-	do
-	{
-		barrier(CLK_LOCAL_MEM_FENCE);
-	
-		if (j<numObjects)
-		{
-	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
-			{
-				if (!localBreak)
-				{
-					atomic_inc(breakRequest);
-					localBreak = 1;
-				}
-			}
-		}
-		
-		barrier(CLK_LOCAL_MEM_FENCE);
-		
-		if (j>=numObjects && !localBreak)
-		{
-			atomic_inc(breakRequest);
-			localBreak = 1;
-		}
-		barrier(CLK_LOCAL_MEM_FENCE);
-		
-		if (!localBreak)
-		{
-			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
-			{
-				int2 myPair;
-				myPair.x = myAabb.m_minIndices[3];
-				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
-				myPairs[curNumPairs] = myPair;
-				curNumPairs++;
-				if (curNumPairs==64)
-				{
-					int curPair = atomic_add(pairCount,curNumPairs);
-					for (int p=0;p<curNumPairs;p++)
-					{
-						if ((curPair+p)<maxPairs)
-						{
-							int4 tmpPair;
-							tmpPair.x = myPairs[p].x;
-							tmpPair.y = myPairs[p].y;
-							tmpPair.z = NEW_PAIR_MARKER;
-							tmpPair.w = NEW_PAIR_MARKER;
-							
-
-							pairsOut[curPair+p] = tmpPair; //flush to main memory
-						}
-					}
-					curNumPairs = 0;
-				}
-			}
-		}
-		barrier(CLK_LOCAL_MEM_FENCE);
-		
-		localCount++;
-		if (localCount==64)
-		{
-			localCount = 0;
-			block+=64;			
-			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
-			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
-		}
-		j++;
-		
-	} while (breakRequest[0]<numActiveWgItems[0]);
-	
-	
-	if (curNumPairs>0)
-	{
-		//avoid a buffer overrun
-		int curPair = atomic_add(pairCount,curNumPairs);
-		for (int p=0;p<curNumPairs;p++)
-		{
-				if ((curPair+p)<maxPairs)
-				{
-					int4 tmpPair;
-					tmpPair.x = myPairs[p].x;
-					tmpPair.y = myPairs[p].y;
-					tmpPair.z = NEW_PAIR_MARKER;
-					tmpPair.w = NEW_PAIR_MARKER;
-					pairsOut[curPair+p] = tmpPair; //flush to main memory
-				}
-		}
-		curNumPairs = 0;
-	}
-}
\ No newline at end of file
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
deleted file mode 100644
index a22731cc9..000000000
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
+++ /dev/null
@@ -1,419 +0,0 @@
-//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
-static const char* sapFastCL= \
-"/*\n"
-"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
-"This software is provided 'as-is', without any express or implied warranty.\n"
-"In no event will the authors be held liable for any damages arising from the use of this software.\n"
-"Permission is granted to anyone to use this software for any purpose, \n"
-"including commercial applications, and to alter it and redistribute it freely, \n"
-"subject to the following restrictions:\n"
-"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
-"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
-"3. This notice may not be removed or altered from any source distribution.\n"
-"*/\n"
-"//Originally written by Erwin Coumans\n"
-"#define NEW_PAIR_MARKER -1\n"
-"#define REMOVED_PAIR_MARKER -2\n"
-"typedef struct \n"
-"{\n"
-"	union\n"
-"	{\n"
-"		float4	m_min;\n"
-"		float   m_minElems[4];\n"
-"		int			m_minIndices[4];\n"
-"	};\n"
-"	union\n"
-"	{\n"
-"		float4	m_max;\n"
-"		float   m_maxElems[4];\n"
-"		int			m_maxIndices[4];\n"
-"	};\n"
-"} btAabbCL;\n"
-"typedef struct \n"
-"{\n"
-"	union\n"
-"	{\n"
-"		unsigned int m_key;\n"
-"		unsigned int x;\n"
-"	};\n"
-"	union\n"
-"	{\n"
-"		unsigned int m_value;\n"
-"		unsigned int y;\n"
-"		\n"
-"	};\n"
-"}b3SortData;\n"
-"/// conservative test for overlap between two aabbs\n"
-"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
-"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
-"{\n"
-"//skip pairs between static (mass=0) objects\n"
-"	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
-"		return false;\n"
-"		\n"
-"	bool overlap = true;\n"
-"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
-"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
-"	return overlap;\n"
-"}\n"
-"__kernel void   computePairsIncremental3dSapKernel( __global const uint2* objectMinMaxIndexGPUaxis0,\n"
-"													__global const uint2* objectMinMaxIndexGPUaxis1,\n"
-"													__global const uint2* objectMinMaxIndexGPUaxis2,\n"
-"													__global const uint2* objectMinMaxIndexGPUaxis0prev,\n"
-"													__global const uint2* objectMinMaxIndexGPUaxis1prev,\n"
-"													__global const uint2* objectMinMaxIndexGPUaxis2prev,\n"
-"													__global const b3SortData*	   sortedAxisGPU0,\n"
-"													__global const b3SortData*	   sortedAxisGPU1,\n"
-"													__global const b3SortData*	   sortedAxisGPU2,\n"
-"													__global const b3SortData*	   sortedAxisGPU0prev,\n"
-"													__global const b3SortData*	   sortedAxisGPU1prev,\n"
-"													__global const b3SortData*	   sortedAxisGPU2prev,\n"
-"													__global int4*			addedHostPairsGPU,\n"
-"													__global int4*			removedHostPairsGPU,\n"
-"													volatile __global int*				addedHostPairsCount,\n"
-"													volatile __global int*				removedHostPairsCount,\n"
-"													int maxCapacity,\n"
-"													int numObjects)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	if (i>=numObjects)\n"
-"		return;\n"
-"	__global const uint2* objectMinMaxIndexGPU[3][2];\n"
-"	objectMinMaxIndexGPU[0][0]=objectMinMaxIndexGPUaxis0;\n"
-"	objectMinMaxIndexGPU[1][0]=objectMinMaxIndexGPUaxis1;\n"
-"	objectMinMaxIndexGPU[2][0]=objectMinMaxIndexGPUaxis2;\n"
-"	objectMinMaxIndexGPU[0][1]=objectMinMaxIndexGPUaxis0prev;\n"
-"	objectMinMaxIndexGPU[1][1]=objectMinMaxIndexGPUaxis1prev;\n"
-"	objectMinMaxIndexGPU[2][1]=objectMinMaxIndexGPUaxis2prev;\n"
-"	__global const b3SortData* sortedAxisGPU[3][2];\n"
-"	sortedAxisGPU[0][0] = sortedAxisGPU0;\n"
-"	sortedAxisGPU[1][0] = sortedAxisGPU1;\n"
-"	sortedAxisGPU[2][0] = sortedAxisGPU2;\n"
-"	sortedAxisGPU[0][1] = sortedAxisGPU0prev;\n"
-"	sortedAxisGPU[1][1] = sortedAxisGPU1prev;\n"
-"	sortedAxisGPU[2][1] = sortedAxisGPU2prev;\n"
-"	int m_currentBuffer = 0;\n"
-"	for (int axis=0;axis<3;axis++)\n"
-"	{\n"
-"		//int i = checkObjects[a];\n"
-"		unsigned int curMinIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].x;\n"
-"		unsigned int curMaxIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].y;\n"
-"		unsigned int prevMinIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].x;\n"
-"		int dmin = curMinIndex - prevMinIndex;\n"
-"				\n"
-"		unsigned int prevMaxIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].y;\n"
-"		int dmax = curMaxIndex - prevMaxIndex;\n"
-"	\n"
-"		for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)\n"
-"		{\n"
-"			if (dmin!=0)\n"
-"			{\n"
-"				int stepMin = dmin<0 ? -1 : 1;\n"
-"				for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)\n"
-"				{\n"
-"					int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;\n"
-"					int otherIndex = otherIndex2/2;\n"
-"					if (otherIndex!=i)\n"
-"					{\n"
-"						bool otherIsMax = ((otherIndex2&1)!=0);\n"
-"						if (otherIsMax)\n"
-"						{\n"
-"									\n"
-"							bool overlap = true;\n"
-"							for (int ax=0;ax<3;ax++)\n"
-"							{\n"
-"								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||\n"
-"									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))\n"
-"									overlap=false;\n"
-"							}\n"
-"						//	b3Assert(overlap2==overlap);\n"
-"							bool prevOverlap = true;\n"
-"							for (int ax=0;ax<3;ax++)\n"
-"							{\n"
-"								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||\n"
-"									(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))\n"
-"									prevOverlap=false;\n"
-"							}\n"
-"									\n"
-"							//b3Assert(overlap==overlap2);\n"
-"								\n"
-"							if (dmin<0)\n"
-"							{\n"
-"								if (overlap && !prevOverlap)\n"
-"								{\n"
-"									//add a pair\n"
-"									int4 newPair;\n"
-"									if (i<=otherIndex)\n"
-"									{\n"
-"										newPair.x = i;\n"
-"										newPair.y = otherIndex;\n"
-"									} else\n"
-"									{\n"
-"										newPair.x = otherIndex;\n"
-"										newPair.y = i;\n"
-"									}\n"
-"									\n"
-"									{\n"
-"										int curPair = atomic_inc(addedHostPairsCount);\n"
-"										if (curPair<maxCapacity)\n"
-"										{\n"
-"											addedHostPairsGPU[curPair].x = newPair.x;\n"
-"											addedHostPairsGPU[curPair].y = newPair.y;\n"
-"											addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;\n"
-"											addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;\n"
-"										}\n"
-"									}\n"
-"								}\n"
-"							} \n"
-"							else\n"
-"							{\n"
-"								if (!overlap && prevOverlap)\n"
-"								{\n"
-"									\n"
-"									//remove a pair\n"
-"									int4 removedPair;\n"
-"									if (i<=otherIndex)\n"
-"									{\n"
-"										removedPair.x = i;\n"
-"										removedPair.y = otherIndex;\n"
-"									} else\n"
-"									{\n"
-"										removedPair.x = otherIndex;\n"
-"										removedPair.y = i;\n"
-"									}\n"
-"									{\n"
-"										int curPair = atomic_inc(removedHostPairsCount);\n"
-"										if (curPair<maxCapacity)\n"
-"										{\n"
-"											\n"
-"											removedHostPairsGPU[curPair].x = removedPair.x;\n"
-"											removedHostPairsGPU[curPair].y = removedPair.y;\n"
-"											removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;\n"
-"											removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;\n"
-"										}\n"
-"									}\n"
-"								}\n"
-"							}//otherisMax\n"
-"						}//if (dmin<0)\n"
-"					}//if (otherIndex!=i)\n"
-"				}//for (int j=\n"
-"			}\n"
-"				\n"
-"			if (dmax!=0)\n"
-"			{\n"
-"				int stepMax = dmax<0 ? -1 : 1;\n"
-"				for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)\n"
-"				{\n"
-"					int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;\n"
-"					int otherIndex = otherIndex2/2;\n"
-"					if (otherIndex!=i)\n"
-"					{\n"
-"						bool otherIsMin = ((otherIndex2&1)==0);\n"
-"						if (otherIsMin)\n"
-"						{\n"
-"									\n"
-"							bool overlap = true;\n"
-"							for (int ax=0;ax<3;ax++)\n"
-"							{\n"
-"								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||\n"
-"									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))\n"
-"									overlap=false;\n"
-"							}\n"
-"							//b3Assert(overlap2==overlap);\n"
-"							bool prevOverlap = true;\n"
-"							for (int ax=0;ax<3;ax++)\n"
-"							{\n"
-"								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||\n"
-"									(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))\n"
-"									prevOverlap=false;\n"
-"							}\n"
-"									\n"
-"							if (dmax>0)\n"
-"							{\n"
-"								if (overlap && !prevOverlap)\n"
-"								{\n"
-"									//add a pair\n"
-"									int4 newPair;\n"
-"									if (i<=otherIndex)\n"
-"									{\n"
-"										newPair.x = i;\n"
-"										newPair.y = otherIndex;\n"
-"									} else\n"
-"									{\n"
-"										newPair.x = otherIndex;\n"
-"										newPair.y = i;\n"
-"									}\n"
-"									{\n"
-"										int curPair = atomic_inc(addedHostPairsCount);\n"
-"										if (curPair<maxCapacity)\n"
-"										{\n"
-"											\n"
-"											addedHostPairsGPU[curPair].x = newPair.x;\n"
-"											addedHostPairsGPU[curPair].y = newPair.y;\n"
-"											addedHostPairsGPU[curPair].z = NEW_PAIR_MARKER;\n"
-"											addedHostPairsGPU[curPair].w = NEW_PAIR_MARKER;\n"
-"										}\n"
-"									}\n"
-"							\n"
-"								}\n"
-"							} \n"
-"							else\n"
-"							{\n"
-"								if (!overlap && prevOverlap)\n"
-"								{\n"
-"									//if (otherIndex2&1==0) -> min?\n"
-"									//remove a pair\n"
-"									int4 removedPair;\n"
-"									if (i<=otherIndex)\n"
-"									{\n"
-"										removedPair.x = i;\n"
-"										removedPair.y = otherIndex;\n"
-"									} else\n"
-"									{\n"
-"										removedPair.x = otherIndex;\n"
-"										removedPair.y = i;\n"
-"									}\n"
-"									{\n"
-"										int curPair = atomic_inc(removedHostPairsCount);\n"
-"										if (curPair<maxCapacity)\n"
-"										{\n"
-"											\n"
-"											removedHostPairsGPU[curPair].x = removedPair.x;\n"
-"											removedHostPairsGPU[curPair].y = removedPair.y;\n"
-"											removedHostPairsGPU[curPair].z = REMOVED_PAIR_MARKER;\n"
-"											removedHostPairsGPU[curPair].w = REMOVED_PAIR_MARKER;\n"
-"										}\n"
-"									}\n"
-"								\n"
-"								}\n"
-"							}\n"
-"							\n"
-"						}//if (dmin<0)\n"
-"					}//if (otherIndex!=i)\n"
-"				}//for (int j=\n"
-"			}\n"
-"		}//for (int otherbuffer\n"
-"	}//for (int axis=0;\n"
-"}\n"
-"__kernel void   computePairsKernelLocalSharedMemoryBatchWrite( __global const btAabbCL* aabbs, volatile __global int4* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
-"{\n"
-"	int i = get_global_id(0);\n"
-"	int localId = get_local_id(0);\n"
-"	__local int numActiveWgItems[1];\n"
-"	__local int breakRequest[1];\n"
-"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
-"	\n"
-"	int2 myPairs[64];\n"
-"	\n"
-"	btAabbCL myAabb;\n"
-"	\n"
-"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
-"	float testValue = 	myAabb.m_maxElems[axis];\n"
-"	\n"
-"	if (localId==0)\n"
-"	{\n"
-"		numActiveWgItems[0] = 0;\n"
-"		breakRequest[0] = 0;\n"
-"	}\n"
-"	int localCount=0;\n"
-"	int block=0;\n"
-"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
-"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
-"	\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	atomic_inc(numActiveWgItems);\n"
-"	barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	int localBreak = 0;\n"
-"	int curNumPairs = 0;\n"
-"	\n"
-"	int j=i+1;\n"
-"	do\n"
-"	{\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"	\n"
-"		if (j<numObjects)\n"
-"		{\n"
-"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
-"			{\n"
-"				if (!localBreak)\n"
-"				{\n"
-"					atomic_inc(breakRequest);\n"
-"					localBreak = 1;\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		if (j>=numObjects && !localBreak)\n"
-"		{\n"
-"			atomic_inc(breakRequest);\n"
-"			localBreak = 1;\n"
-"		}\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		if (!localBreak)\n"
-"		{\n"
-"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
-"			{\n"
-"				int2 myPair;\n"
-"				myPair.x = myAabb.m_minIndices[3];\n"
-"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
-"				myPairs[curNumPairs] = myPair;\n"
-"				curNumPairs++;\n"
-"				if (curNumPairs==64)\n"
-"				{\n"
-"					int curPair = atomic_add(pairCount,curNumPairs);\n"
-"					for (int p=0;p<curNumPairs;p++)\n"
-"					{\n"
-"						if ((curPair+p)<maxPairs)\n"
-"						{\n"
-"							int4 tmpPair;\n"
-"							tmpPair.x = myPairs[p].x;\n"
-"							tmpPair.y = myPairs[p].y;\n"
-"							tmpPair.z = NEW_PAIR_MARKER;\n"
-"							tmpPair.w = NEW_PAIR_MARKER;\n"
-"							\n"
-"							pairsOut[curPair+p] = tmpPair; //flush to main memory\n"
-"						}\n"
-"					}\n"
-"					curNumPairs = 0;\n"
-"				}\n"
-"			}\n"
-"		}\n"
-"		barrier(CLK_LOCAL_MEM_FENCE);\n"
-"		\n"
-"		localCount++;\n"
-"		if (localCount==64)\n"
-"		{\n"
-"			localCount = 0;\n"
-"			block+=64;			\n"
-"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
-"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
-"		}\n"
-"		j++;\n"
-"		\n"
-"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
-"	\n"
-"	\n"
-"	if (curNumPairs>0)\n"
-"	{\n"
-"		//avoid a buffer overrun\n"
-"		int curPair = atomic_add(pairCount,curNumPairs);\n"
-"		for (int p=0;p<curNumPairs;p++)\n"
-"		{\n"
-"				if ((curPair+p)<maxPairs)\n"
-"				{\n"
-"					int4 tmpPair;\n"
-"					tmpPair.x = myPairs[p].x;\n"
-"					tmpPair.y = myPairs[p].y;\n"
-"					tmpPair.z = NEW_PAIR_MARKER;\n"
-"					tmpPair.w = NEW_PAIR_MARKER;\n"
-"					pairsOut[curPair+p] = tmpPair; //flush to main memory\n"
-"				}\n"
-"		}\n"
-"		curNumPairs = 0;\n"
-"	}\n"
-"}\n"
-;
diff --git a/src/BulletDynamics/Featherstone/btMultiBody.h b/src/BulletDynamics/Featherstone/btMultiBody.h
index a4243ddcd..d08c0b09c 100644
--- a/src/BulletDynamics/Featherstone/btMultiBody.h
+++ b/src/BulletDynamics/Featherstone/btMultiBody.h
@@ -418,6 +418,11 @@ public:
 		m_canSleep = canSleep;
 	}
 
+	bool getCanSleep()const	// Read accessor: returns the current value of m_canSleep.
+	{
+		return m_canSleep;
+	}
+
     bool isAwake() const { return m_awake; }
     void wakeUp();
     void goToSleep();
diff --git a/test/OpenCL/AllBullet3Kernels/initCL.h b/test/OpenCL/AllBullet3Kernels/initCL.h
new file mode 100644
index 000000000..4d05971ff
--- /dev/null
+++ b/test/OpenCL/AllBullet3Kernels/initCL.h
@@ -0,0 +1,79 @@
+
+#ifndef INIT_CL_H
+#define INIT_CL_H
+
+// Shared OpenCL set-up for the kernel test fixtures: creates the context and, if a device exists, its queue.
+// Written as a member-function fragment: it assigns m_clContext, m_platformId, m_clDevice, m_clQueue and
+// m_clDeviceName, none of which are declared in this header, and reads the globals gArgc/gArgv.
+void initCL()
+		{
+			// Initial values for the settings read from the command line below.
+			int preferredDeviceIndex=-1;
+			int preferredPlatformIndex=-1;
+			bool allowCpuOpenCL=false;
+			// Read the cl_device, cl_platform and allow_opencl_cpu options from gArgc/gArgv.
+			b3CommandLineArgs args(gArgc,gArgv);
+			args.GetCmdLineArgument("cl_device", preferredDeviceIndex);
+			args.GetCmdLineArgument("cl_platform", preferredPlatformIndex);
+			allowCpuOpenCL = args.CheckCmdLineFlag("allow_opencl_cpu");
+			// NOTE(review): glCtx and glDC are never read below; they appear only in the commented-out interop call.
+			void* glCtx=0;
+			void* glDC = 0;
+	
+	
+    
+			int ciErrNum = 0;
+			// GPU devices only by default; allow_opencl_cpu widens the request to every device type.
+			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+			if (allowCpuOpenCL)
+				deviceType = CL_DEVICE_TYPE_ALL;
+
+	
+			// The interop variant is disabled; the live call passes 0,0 where that variant passed glCtx,glDC.
+			//	if (useInterop)
+			//	{
+			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+			//	} else
+			{
+				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
+				ASSERT_FALSE(m_clContext==0);
+			}
+			// Print name, vendor and version of the platform reported through m_platformId.
+			b3OpenCLPlatformInfo platformInfo;
+			b3OpenCLUtils::getPlatformInfo(m_platformId,&platformInfo);
+			b3Printf("OpenCL Platform Name %s\n", platformInfo.m_platformName);
+			b3Printf("OpenCL Platform Vendor %s\n", platformInfo.m_platformVendor);
+			b3Printf("OpenCL Platform Version %s\n", platformInfo.m_platformVersion);
+			
+			// ciErrNum was passed by address to createContextFromType above.
+			ASSERT_EQ(ciErrNum, CL_SUCCESS);
+	
+			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
+			EXPECT_GT(numDev,0);
+			// Only the first device of the context (index 0) is used.
+			if (numDev>0)
+			{
+				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
+				ASSERT_FALSE(m_clDevice==0);
+
+				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
+				ASSERT_FALSE(m_clQueue==0);
+				// Re-check ciErrNum, which was passed to clCreateCommandQueue as its error output.
+				ASSERT_EQ(ciErrNum, CL_SUCCESS);
+        
+				// Print the device details and keep the device name in m_clDeviceName.
+				b3OpenCLDeviceInfo info;
+				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
+				b3OpenCLUtils::printDeviceInfo(m_clDevice);
+				m_clDeviceName = info.m_deviceName;
+			}
+		}
+		// Releases the command queue, then the context. NOTE(review): initCL assigns m_clQueue only when numDev>0 - confirm it is valid here.
+		void	exitCL()
+		{
+			clReleaseCommandQueue(m_clQueue);
+			clReleaseContext(m_clContext);
+		}
+
+#endif //INIT_CL_H
+
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3BroadphaseKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3BroadphaseKernels.cpp
index 66fec170b..347a860e9 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3BroadphaseKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3BroadphaseKernels.cpp
@@ -4,7 +4,6 @@
 #include "Bullet3Common/b3CommandLineArgs.h"
 #include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 #include "Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h"
-#include "Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h"
 #include "Bullet3OpenCL/BroadphaseCollision/kernels/gridBroadphaseKernels.h"
 
 extern int gArgc;
@@ -34,7 +33,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~CompileBullet3BroadphaseKernels() 
@@ -46,58 +45,9 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
-	
-	
-    
-			int ciErrNum = 0;
+		#include "initCL.h"
 
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
+		
 
 		virtual void SetUp() 
 		{
@@ -114,42 +64,7 @@ namespace
 		}
 	};
 
-	TEST_F(CompileBullet3BroadphaseKernels,sapFastKernels)
-	{
-		cl_int errNum=0;
 	
-		cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_clContext,m_clDevice,sapFastCL,&errNum,"",0,true);
-
-		{
-			cl_kernel k = b3OpenCLUtils::compileCLKernelFromString(m_clContext, m_clDevice,sapFastCL, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
-			ASSERT_EQ(errNum,CL_SUCCESS);
-			ASSERT_FALSE(k==0);
-			clReleaseKernel(k);
-		}
-		{
-			cl_kernel k= b3OpenCLUtils::compileCLKernelFromString(m_clContext, m_clDevice,sapFastCL, "computePairsIncremental3dSapKernel",&errNum,sapFastProg );
-			ASSERT_EQ(errNum,CL_SUCCESS);
-			ASSERT_FALSE(k==0);
-			clReleaseKernel(k);
-		}
-		
-		{
-			cl_kernel k = b3OpenCLUtils::compileCLKernelFromString(m_clContext, m_clDevice,sapFastCL, "computePairsKernelLocalSharedMemoryBatchWrite",&errNum,sapFastProg );
-			ASSERT_EQ(errNum,CL_SUCCESS);
-			ASSERT_FALSE(k==0);
-			clReleaseKernel(k);
-		}
-
-		{
-			cl_kernel m_computePairsIncremental3dSapKernel= b3OpenCLUtils::compileCLKernelFromString(m_clContext, m_clDevice,sapFastCL, "computePairsIncremental3dSapKernel",&errNum,sapFastProg );
-			ASSERT_EQ(errNum,CL_SUCCESS);
-			ASSERT_FALSE(m_computePairsIncremental3dSapKernel==0);
-			clReleaseKernel(m_computePairsIncremental3dSapKernel);
-		}
-
-		clReleaseProgram(sapFastProg);
-		
-	}
 
 	TEST_F(CompileBullet3BroadphaseKernels,sapKernels)
 	{
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3IntegrateUpdateAabbKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3IntegrateUpdateAabbKernels.cpp
index 63396191d..3b6f3d426 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3IntegrateUpdateAabbKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3IntegrateUpdateAabbKernels.cpp
@@ -35,7 +35,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~testCompileBullet3IntegrateUpdateAabbKernels() 
@@ -47,58 +47,8 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
 	
-	
-    
-			int ciErrNum = 0;
-
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
+		#include "initCL.h"
 
 		virtual void SetUp() 
 		{
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3JacobiContactSolverKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3JacobiContactSolverKernels.cpp
index 175597f7a..d9af49e90 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3JacobiContactSolverKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3JacobiContactSolverKernels.cpp
@@ -33,7 +33,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~CompileBullet3JacobiContactSolverKernels() 
@@ -45,58 +45,8 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
-	
-	
-    
-			int ciErrNum = 0;
+		#include "initCL.h"
 
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
 
 		virtual void SetUp() 
 		{
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3NarrowphaseKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3NarrowphaseKernels.cpp
index f8044dac4..98520fe18 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3NarrowphaseKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3NarrowphaseKernels.cpp
@@ -39,7 +39,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~CompileBullet3NarrowphaseKernels() 
@@ -51,58 +51,7 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
-	
-	
-    
-			int ciErrNum = 0;
-
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
+		#include "initCL.h"
 
 		virtual void SetUp() 
 		{
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsContactSolverKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsContactSolverKernels.cpp
index 562902077..e5ca5d1ea 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsContactSolverKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsContactSolverKernels.cpp
@@ -39,7 +39,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~CompileBullet3PgsContactSolverKernels() 
@@ -51,58 +51,7 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
-	
-	
-    
-			int ciErrNum = 0;
-
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
+		#include "initCL.h"
 
 		virtual void SetUp() 
 		{
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsJointSolverKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsJointSolverKernels.cpp
index f9dfeef22..e0b59e374 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsJointSolverKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3PgsJointSolverKernels.cpp
@@ -33,7 +33,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~testCompileBullet3PgsJointSolverKernels() 
@@ -45,58 +45,7 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
-	
-	
-    
-			int ciErrNum = 0;
-
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
+		#include "initCL.h"
 
 		virtual void SetUp() 
 		{
diff --git a/test/OpenCL/AllBullet3Kernels/testCompileBullet3RaycastKernels.cpp b/test/OpenCL/AllBullet3Kernels/testCompileBullet3RaycastKernels.cpp
index 7e80dda33..d3dbf4444 100644
--- a/test/OpenCL/AllBullet3Kernels/testCompileBullet3RaycastKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testCompileBullet3RaycastKernels.cpp
@@ -33,7 +33,7 @@ namespace
 			bool allowCpuOpenCL = false;
 
 
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~CompileBullet3RaycastKernels() 
@@ -45,58 +45,8 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
 	
-	
-    
-			int ciErrNum = 0;
-
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
+		#include "initCL.h"
 
 		virtual void SetUp() 
 		{
diff --git a/test/OpenCL/AllBullet3Kernels/testExecuteBullet3NarrowphaseKernels.cpp b/test/OpenCL/AllBullet3Kernels/testExecuteBullet3NarrowphaseKernels.cpp
index b11de392c..15c64196a 100644
--- a/test/OpenCL/AllBullet3Kernels/testExecuteBullet3NarrowphaseKernels.cpp
+++ b/test/OpenCL/AllBullet3Kernels/testExecuteBullet3NarrowphaseKernels.cpp
@@ -1,9 +1,9 @@
 
 #include <gtest/gtest.h>
 #include "Bullet3Common/b3Logging.h"
-#include "Bullet3Common/b3CommandLineArgs.h"
-#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
 
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "Bullet3Common/b3CommandLineArgs.h"
 #include "Bullet3OpenCL/NarrowphaseCollision/kernels/satKernels.h"
 #include "Bullet3OpenCL/NarrowphaseCollision/kernels/mprKernels.h"
 #include "Bullet3OpenCL/NarrowphaseCollision/kernels/satConcaveKernels.h"
@@ -37,13 +37,8 @@ namespace
 			m_platformId(0)
 		{
 				// You can do set-up work for each test here.
-			b3CommandLineArgs args(gArgc,gArgv);
-			int preferredDeviceIndex=-1;
-			int preferredPlatformIndex = -1;
-			bool allowCpuOpenCL = false;
 
-
-			initCL(preferredDeviceIndex, preferredPlatformIndex, allowCpuOpenCL);
+			initCL();
 		}
 
 		virtual ~ExecuteBullet3NarrowphaseKernels() 
@@ -55,59 +50,8 @@ namespace
 		// If the constructor and destructor are not enough for setting up
 		// and cleaning up each test, you can define the following methods:
 
-		void initCL(int preferredDeviceIndex, int preferredPlatformIndex, bool allowCpuOpenCL)
-		{
-			void* glCtx=0;
-			void* glDC = 0;
-	
-	
-    
-			int ciErrNum = 0;
-
-			cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
-			if (allowCpuOpenCL)
-				deviceType = CL_DEVICE_TYPE_ALL;
-
-	
-	
-			//	if (useInterop)
-			//	{
-			//		m_data->m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
-			//	} else
-			{
-				m_clContext = b3OpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex,&m_platformId);
-				ASSERT_FALSE(m_clContext==0);
-			}
-	
-	
-			ASSERT_EQ(ciErrNum, CL_SUCCESS);
-	
-			int numDev = b3OpenCLUtils::getNumDevices(m_clContext);
-			EXPECT_GT(numDev,0);
-
-			if (numDev>0)
-			{
-				m_clDevice= b3OpenCLUtils::getDevice(m_clContext,0);
-				ASSERT_FALSE(m_clDevice==0);
-
-				m_clQueue = clCreateCommandQueue(m_clContext, m_clDevice, 0, &ciErrNum);
-				ASSERT_FALSE(m_clQueue==0);
-				
-				ASSERT_EQ(ciErrNum, CL_SUCCESS);
-        
-        
-				b3OpenCLDeviceInfo info;
-				b3OpenCLUtils::getDeviceInfo(m_clDevice,&info);
-				m_clDeviceName = info.m_deviceName;
-			}
-		}
-
-		void	exitCL()
-		{
-			clReleaseCommandQueue(m_clQueue);
-			clReleaseContext(m_clContext);
-		}
-
+		#include "initCL.h"
+		
 		virtual void SetUp() 
 		{
 
diff --git a/test/TestBullet3OpenCL/main.cpp b/test/TestBullet3OpenCL/main.cpp
index f74b9d215..4a18ce702 100644
--- a/test/TestBullet3OpenCL/main.cpp
+++ b/test/TestBullet3OpenCL/main.cpp
@@ -9,7 +9,7 @@ void myerrorprintf(const char* msg)
 	printf(msg);
 }
 	
-static bool sVerboseWarning = false;
+static bool sVerboseWarning = true;
 
 void mywarningprintf(const char* msg)
 {
@@ -20,7 +20,7 @@ void mywarningprintf(const char* msg)
 	}
 }
 
-static bool sVerbosePrintf=false;
+static bool sVerbosePrintf=true;//false;
 
 void myprintf(const char* msg)
 {
diff --git a/test/TestBullet3OpenCL/premake4.lua b/test/TestBullet3OpenCL/premake4.lua
index d0fa62745..f2a6e0042 100644
--- a/test/TestBullet3OpenCL/premake4.lua
+++ b/test/TestBullet3OpenCL/premake4.lua
@@ -6,6 +6,12 @@ function createProject(vendor)
 	
 --	defines {  }
 	
+	if os.is("Windows") then
+		--see http://stackoverflow.com/questions/12558327/google-test-in-visual-studio-2012
+		defines {"_VARIADIC_MAX=10"}
+	end
+	
+	
 	targetdir "../../bin"
 	
 	initOpenCL(vendor)
diff --git a/test/gtest-1.7.0/premake4.lua b/test/gtest-1.7.0/premake4.lua
index c7e9e1931..5fdf84042 100644
--- a/test/gtest-1.7.0/premake4.lua
+++ b/test/gtest-1.7.0/premake4.lua
@@ -5,6 +5,10 @@
 	files{"src/gtest-all.cc"}
 
 	--defines {"GTEST_HAS_PTHREAD=1"}	
+	
+	--see http://stackoverflow.com/questions/12558327/google-test-in-visual-studio-2012
+	defines {"_VARIADIC_MAX=10"}	
+	
 	--targetdir "../../lib"	
 
 	includedirs {