From 9a7414f4e979333cecfc44e374492e15a47c7c0b Mon Sep 17 00:00:00 2001
From: erwin coumans <erwin.coumans@gmail.com>
Date: Fri, 15 Mar 2013 16:27:23 -0700
Subject: [PATCH] cleanup of gpu rigid body (removed all Adl stuff)

---
 build/stringify.bat                           |   2 +
 demo/gpudemo/GpuDemo.cpp                      |   4 +-
 demo/gpudemo/GpuDemo.h                        |   6 +-
 demo/gpudemo/broadphase/PairBench.cpp         |   6 +-
 demo/gpudemo/main_opengl3core.cpp             |   5 +
 demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp   | 262 +++++++++
 demo/gpudemo/rigidbody/GpuRigidBodyDemo.h     |  43 ++
 .../gpu_broadphase/host/btGpuSapBroadphase.h  |  15 +-
 opencl/gpu_broadphase/host/btSapAabb.h        |  18 +
 opencl/gpu_rigidbody/host/btConfig.h          |  34 ++
 .../gpu_rigidbody/host/btGpuNarrowPhase.cpp   | 504 ++++++++++++++++++
 opencl/gpu_rigidbody/host/btGpuNarrowPhase.h  |  76 +++
 .../host/btGpuRigidBodyPipeline.cpp           | 171 ++++++
 .../host/btGpuRigidBodyPipeline.h             |  38 ++
 .../host/btGpuRigidBodyPipelineInternalData.h |  26 +
 .../gpu_rigidbody/kernels/integrateKernel.cl  |  89 ++++
 .../gpu_rigidbody/kernels/integrateKernel.h   |  93 ++++
 opencl/gpu_sat/host/btCollidable.h            |  12 +
 opencl/gpu_sat/host/btRigidBodyCL.h           |   2 +-
 19 files changed, 1383 insertions(+), 23 deletions(-)
 create mode 100644 demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp
 create mode 100644 demo/gpudemo/rigidbody/GpuRigidBodyDemo.h
 create mode 100644 opencl/gpu_broadphase/host/btSapAabb.h
 create mode 100644 opencl/gpu_rigidbody/host/btConfig.h
 create mode 100644 opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp
 create mode 100644 opencl/gpu_rigidbody/host/btGpuNarrowPhase.h
 create mode 100644 opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp
 create mode 100644 opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.h
 create mode 100644 opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h
 create mode 100644 opencl/gpu_rigidbody/kernels/integrateKernel.cl
 create mode 100644 opencl/gpu_rigidbody/kernels/integrateKernel.h

diff --git a/build/stringify.bat b/build/stringify.bat
index 0c04729f6..74fd737ee 100644
--- a/build/stringify.bat
+++ b/build/stringify.bat
@@ -14,6 +14,8 @@ premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kerne
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/sat.cl" --headerfile="../opencl/gpu_sat/kernels/satKernels.h" --stringname="satKernelsCL" stringify
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_sat/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
 
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
+
 
 
 pause
\ No newline at end of file
diff --git a/demo/gpudemo/GpuDemo.cpp b/demo/gpudemo/GpuDemo.cpp
index afaef84da..ff257294c 100644
--- a/demo/gpudemo/GpuDemo.cpp
+++ b/demo/gpudemo/GpuDemo.cpp
@@ -40,9 +40,9 @@ void GpuDemo::initCL(int preferredDeviceIndex, int preferredPlatformIndex)
     
 	int ciErrNum = 0;
 	//#ifdef CL_PLATFORM_INTEL
-		cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+	//cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
 	//#else
-	//cl_device_type deviceType = CL_DEVICE_TYPE_CPU;
+	cl_device_type deviceType = CL_DEVICE_TYPE_CPU;
 	//#endif
 	
 	cl_platform_id platformId;
diff --git a/demo/gpudemo/GpuDemo.h b/demo/gpudemo/GpuDemo.h
index b4dd85d8f..0815b79ac 100644
--- a/demo/gpudemo/GpuDemo.h
+++ b/demo/gpudemo/GpuDemo.h
@@ -34,9 +34,9 @@ public:
                         :useOpenCL(false),//true),
                         preferredOpenCLPlatformIndex(-1),
                         preferredOpenCLDeviceIndex(-1),
-                        arraySizeX(10),
-                        arraySizeY(10 ),
-                        arraySizeZ(10),
+                        arraySizeX(2),
+                        arraySizeY(2 ),
+                        arraySizeZ(2),
                         m_useConcaveMesh(false),
                         gapX(4.3),
                         gapY(4.0),
diff --git a/demo/gpudemo/broadphase/PairBench.cpp b/demo/gpudemo/broadphase/PairBench.cpp
index 84f480eb9..50b4a9fb3 100644
--- a/demo/gpudemo/broadphase/PairBench.cpp
+++ b/demo/gpudemo/broadphase/PairBench.cpp
@@ -11,12 +11,12 @@
 #include "OpenGLWindow/GLInstanceRendererInternalData.h"
 #include "parallel_primitives/host/btLauncherCL.h"
 
-btKeyboardCallback oldCallback = 0;
+static btKeyboardCallback oldCallback = 0;
 extern bool gReset;
 
 #define MSTRINGIFY(A) #A
 
-const char* s_pairBenchKernelString = MSTRINGIFY(
+static const char* s_pairBenchKernelString = MSTRINGIFY(
 __kernel void moveObjectsKernel(__global float4* posOrnColors, int numObjects)
 {
 	int iGID = get_global_id(0);
@@ -134,7 +134,7 @@ PairBench::~PairBench()
 
 
 
-void PairKeyboardCallback(int key, int state)
+static void PairKeyboardCallback(int key, int state)
 {
 	if (key=='R' && state)
 	{
diff --git a/demo/gpudemo/main_opengl3core.cpp b/demo/gpudemo/main_opengl3core.cpp
index b3e58543c..654165b42 100644
--- a/demo/gpudemo/main_opengl3core.cpp
+++ b/demo/gpudemo/main_opengl3core.cpp
@@ -25,6 +25,7 @@
 #include "gwenUserInterface.h"
 #include "ParticleDemo.h"
 #include "broadphase/PairBench.h"
+#include "rigidbody/GpuRigidBodyDemo.h"
 
 //#include "BroadphaseBenchmark.h"
 
@@ -63,8 +64,11 @@ GpuDemo::CreateFunc* allDemos[]=
 	//BroadphaseBenchmark::CreateFunc,
 	//GpuBoxDemo::CreateFunc,
 	PairBench::MyCreateFunc,	
+	GpuRigidBodyDemo::MyCreateFunc,
+
 	ParticleDemo::MyCreateFunc,
 	
+	
 	//SpheresDemo::CreateFunc,
 	//GpuCompoundDemo::CreateFunc,
 	//EmptyDemo::CreateFunc,
@@ -664,6 +668,7 @@ int main(int argc, char* argv[])
 
 
 		demo->exitPhysics();
+		CProfileManager::CleanupMemory();
 		delete demo;
 		if (f)
 			fclose(f);
diff --git a/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp b/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp
new file mode 100644
index 000000000..eb578f097
--- /dev/null
+++ b/demo/gpudemo/rigidbody/GpuRigidBodyDemo.cpp
@@ -0,0 +1,262 @@
+#include "GpuRigidBodyDemo.h"
+#include "BulletCommon/btQuickprof.h"
+#include "OpenGLWindow/ShapeData.h"
+#include "OpenGLWindow/GLInstancingRenderer.h"
+#include "BulletCommon/btQuaternion.h"
+#include "OpenGLWindow/btgWindowInterface.h"
+#include "gpu_broadphase/host/btGpuSapBroadphase.h"
+#include "../GpuDemoInternalData.h"
+#include "basic_initialize/btOpenCLUtils.h"
+#include "OpenGLWindow/OpenGLInclude.h"
+#include "OpenGLWindow/GLInstanceRendererInternalData.h"
+#include "parallel_primitives/host/btLauncherCL.h"
+#include "gpu_rigidbody/host/btGpuRigidBodyPipeline.h"
+#include "gpu_rigidbody/host/btGpuNarrowPhase.h"
+#include "gpu_rigidbody/host/btConfig.h"
+
+static btKeyboardCallback oldCallback = 0;
+extern bool gReset;
+
+#define MSTRINGIFY(A) #A
+
+static const char* s_rigidBodyKernelString = MSTRINGIFY(
+
+typedef struct
+{
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+	unsigned int m_collidableIdx;
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;
+
+__kernel void 
+	copyTransformsToVBOKernel( __global Body* gBodies, __global float4* posOrnColor, const int numNodes)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{
+		posOrnColor[nodeID] = (float4) (gBodies[nodeID].m_pos.xyz,1.0);
+		posOrnColor[nodeID + numNodes] = gBodies[nodeID].m_quat;
+	}
+}
+);
+
+
+struct	GpuRigidBodyDemoInternalData
+{
+	
+	cl_kernel	m_copyTransformsToVBOKernel;
+
+	btOpenCLArray<btVector4>*	m_instancePosOrnColor;
+
+	class btGpuRigidBodyPipeline* m_rigidBodyPipeline;
+
+	btGpuNarrowPhase* m_np;
+	btGpuSapBroadphase* m_bp;
+
+	GpuRigidBodyDemoInternalData()
+		:m_instancePosOrnColor(0),
+		m_copyTransformsToVBOKernel(0),	m_rigidBodyPipeline(0),
+		m_np(0),
+		m_bp(0)
+	{
+	}
+};
+
+
+GpuRigidBodyDemo::GpuRigidBodyDemo()
+:m_instancingRenderer(0),
+m_window(0)
+{
+	m_data = new GpuRigidBodyDemoInternalData;
+}
+GpuRigidBodyDemo::~GpuRigidBodyDemo()
+{
+	
+	delete m_data;
+}
+
+
+
+
+
+
+
+static void PairKeyboardCallback(int key, int state)
+{
+	if (key=='R' && state)
+	{
+		gReset = true;
+	}
+	
+	//btDefaultKeyboardCallback(key,state);
+	oldCallback(key,state);
+}
+
+
+
+void	GpuRigidBodyDemo::initPhysics(const ConstructionInfo& ci)
+{
+	initCL(ci.preferredOpenCLDeviceIndex,ci.preferredOpenCLPlatformIndex);
+	if (m_clData->m_clContext)
+	{
+		int errNum=0;
+
+		cl_program rbProg=0;
+		m_data->m_copyTransformsToVBOKernel = btOpenCLUtils::compileCLKernelFromString(m_clData->m_clContext,m_clData->m_clDevice,s_rigidBodyKernelString,"copyTransformsToVBOKernel",&errNum,rbProg);
+		
+		btConfig config;
+		btGpuNarrowPhase* np = new btGpuNarrowPhase(m_clData->m_clContext,m_clData->m_clDevice,m_clData->m_clQueue,config);
+		btGpuSapBroadphase* bp = new btGpuSapBroadphase(m_clData->m_clContext,m_clData->m_clDevice,m_clData->m_clQueue);
+		m_data->m_np = np;
+		m_data->m_bp = bp;
+
+		m_data->m_rigidBodyPipeline = new btGpuRigidBodyPipeline(m_clData->m_clContext,m_clData->m_clDevice,m_clData->m_clQueue, np, bp);
+
+		int strideInBytes = 9*sizeof(float);
+		int numVertices = sizeof(cube_vertices)/strideInBytes;
+		int numIndices = sizeof(cube_vertices)/sizeof(int);
+		//int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
+		int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
+		int group=1;
+		int mask=1;
+		int index=10;
+		float scaling[4] = {1,1,1,1};
+
+		int colIndex = np->registerConvexHullShape(&cube_vertices[0],strideInBytes,numVertices, scaling);
+
+		float mass = 1.f;
+		for (int i=0;i<ci.arraySizeX;i++)
+		{
+			for (int j=0;j<ci.arraySizeY;j++)
+			{
+				for (int k=0;k<ci.arraySizeZ;k++)
+				{
+					btVector3 position(k*3,i*3,j*3);
+					btQuaternion orn(1,0,0,0);
+				
+					btVector4 color(0,1,0,1);
+					btVector4 scaling(1,1,1,1);
+					int id = ci.m_instancingRenderer->registerGraphicsInstance(shapeId,position,orn,color,scaling);
+					int pid = m_data->m_rigidBodyPipeline->registerPhysicsInstance(mass,position,orn,colIndex,index);
+				
+					index++;
+				}
+			}
+		}
+		np->writeAllBodiesToGpu();
+	}
+
+	
+	
+	
+	
+	if (ci.m_window)
+	{
+		m_window = ci.m_window;
+		oldCallback = ci.m_window->getKeyboardCallback();
+		ci.m_window->setKeyboardCallback(PairKeyboardCallback);
+
+	}
+
+	m_instancingRenderer = ci.m_instancingRenderer;
+
+
+	float camPos[4]={15.5,12.5,15.5,0};
+	m_instancingRenderer->setCameraTargetPosition(camPos);
+	m_instancingRenderer->setCameraDistance(60);
+
+	m_instancingRenderer->writeTransforms();
+	
+	
+
+}
+
+void	GpuRigidBodyDemo::exitPhysics()
+{
+	delete m_data->m_instancePosOrnColor;
+	delete m_data->m_rigidBodyPipeline;
+
+	m_window->setKeyboardCallback(oldCallback);
+	
+	delete m_data->m_np;
+	m_data->m_np = 0;
+	delete m_data->m_bp;
+	m_data->m_bp = 0;
+
+	exitCL();
+}
+
+
+void GpuRigidBodyDemo::renderScene()
+{
+	m_instancingRenderer->RenderScene();
+}
+
+void GpuRigidBodyDemo::clientMoveAndDisplay()
+{
+	bool animate=true;
+	int numObjects= m_instancingRenderer->getInternalData()->m_totalNumInstances;
+	btVector4* positions = 0;
+	if (animate)
+	{
+		GLuint vbo = m_instancingRenderer->getInternalData()->m_vbo;
+		int arraySizeInBytes  = numObjects * (3)*sizeof(btVector4);
+		glBindBuffer(GL_ARRAY_BUFFER, vbo);
+		cl_bool blocking=  CL_TRUE;
+		positions=  (btVector4*)glMapBufferRange( GL_ARRAY_BUFFER,m_instancingRenderer->getMaxShapeCapacity(),arraySizeInBytes, GL_MAP_WRITE_BIT|GL_MAP_READ_BIT );//GL_READ_WRITE);//GL_WRITE_ONLY
+		GLint err = glGetError();
+		assert(err==GL_NO_ERROR);
+		if (!m_data->m_instancePosOrnColor)
+		{
+			m_data->m_instancePosOrnColor = new btOpenCLArray<btVector4>(m_clData->m_clContext,m_clData->m_clQueue);
+			m_data->m_instancePosOrnColor->resize(3*numObjects);
+			m_data->m_instancePosOrnColor->copyFromHostPointer(positions,3*numObjects,0);
+		}
+	}
+
+	m_data->m_rigidBodyPipeline->stepSimulation(1./60.f);
+
+	{
+		int ciErrNum = 0;
+
+
+		ciErrNum = 0;//clSetKernelArg(fpio.m_copyTransformsToVBOKernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);	
+		
+		cl_mem bodies = m_data->m_rigidBodyPipeline->getBodyBuffer();
+		btLauncherCL launch(m_clData->m_clQueue,m_data->m_copyTransformsToVBOKernel);
+		launch.setBuffer(bodies);
+		launch.setBuffer(m_data->m_instancePosOrnColor->getBufferCL());
+		launch.setConst(numObjects);
+
+		launch.launch1D(numObjects);
+		//ciErrNum = clSetKernelArg(fpio.m_copyTransformsToVBOKernel, 3, sizeof(cl_mem), (void*)&bodies);
+		//ciErrNum = clSetKernelArg(fpio.m_copyTransformsToVBOKernel, 1, sizeof(int), &fpio.m_numObjects);
+	
+		if (numObjects)
+		{
+			size_t workGroupSize = 64;
+			size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
+				
+			//ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, fpio.m_copyTransformsToVBOKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		}
+
+	}
+
+	if (animate)
+	{
+		GLint err = glGetError();
+		assert(err==GL_NO_ERROR);
+		m_data->m_instancePosOrnColor->copyToHostPointer(positions,3*numObjects,0);
+		glUnmapBuffer( GL_ARRAY_BUFFER);
+
+		err = glGetError();
+		assert(err==GL_NO_ERROR);
+	}
+
+}
diff --git a/demo/gpudemo/rigidbody/GpuRigidBodyDemo.h b/demo/gpudemo/rigidbody/GpuRigidBodyDemo.h
new file mode 100644
index 000000000..a5c5f4569
--- /dev/null
+++ b/demo/gpudemo/rigidbody/GpuRigidBodyDemo.h
@@ -0,0 +1,43 @@
+#ifndef GPU_RIGID_BODY_DEMO_H
+#define GPU_RIGID_BODY_DEMO_H
+
+#include "../GpuDemo.h"
+
+class GpuRigidBodyDemo : public GpuDemo
+{
+	
+	class GLInstancingRenderer* m_instancingRenderer;
+	class btgWindowInterface*	m_window;
+
+	struct GpuRigidBodyDemoInternalData*	m_data;
+
+public:
+	
+	GpuRigidBodyDemo();
+	virtual ~GpuRigidBodyDemo();
+
+	virtual void	initPhysics(const ConstructionInfo& ci);
+	
+	virtual void	exitPhysics();
+	
+	virtual const char* getName()
+	{
+		return "GRBD";
+	}
+	static GpuDemo* MyCreateFunc()
+	{
+		GpuDemo* demo = new GpuRigidBodyDemo;
+		return demo;
+	}
+	
+	
+	
+	virtual void renderScene();
+	
+	virtual void clientMoveAndDisplay();
+
+	
+};
+
+#endif //GPU_RIGID_BODY_DEMO_H
+
diff --git a/opencl/gpu_broadphase/host/btGpuSapBroadphase.h b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
index e7483726d..c119b8ce7 100644
--- a/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
+++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
@@ -6,20 +6,7 @@
 class btVector3;
 #include "parallel_primitives/host/btRadixSort32CL.h"
 
-struct btSapAabb
-{
-	union
-	{
-		float m_min[4];
-		int m_minIndices[4];
-	};
-	union
-	{
-		float m_max[4];
-		int m_signedMaxIndices[4];
-		//unsigned int m_unsignedMaxIndices[4];
-	};
-};
+#include "btSapAabb.h"
 
 
 
diff --git a/opencl/gpu_broadphase/host/btSapAabb.h b/opencl/gpu_broadphase/host/btSapAabb.h
new file mode 100644
index 000000000..3354ffe9e
--- /dev/null
+++ b/opencl/gpu_broadphase/host/btSapAabb.h
@@ -0,0 +1,18 @@
+#ifndef BT_SAP_AABB_H
+#define BT_SAP_AABB_H
+
+struct btSapAabb
+{
+	union
+	{
+		float m_min[4];
+		int m_minIndices[4];
+	};
+	union
+	{
+		float m_max[4];
+		int m_signedMaxIndices[4];
+	};
+};
+
+#endif //BT_SAP_AABB_H
diff --git a/opencl/gpu_rigidbody/host/btConfig.h b/opencl/gpu_rigidbody/host/btConfig.h
new file mode 100644
index 000000000..71b1f1b6c
--- /dev/null
+++ b/opencl/gpu_rigidbody/host/btConfig.h
@@ -0,0 +1,34 @@
+#ifndef BT_CONFIG_H
+#define BT_CONFIG_H
+
+struct	btConfig
+{
+	int	m_maxConvexBodies;
+	int	m_maxConvexShapes;
+	int	m_maxBroadphasePairs;
+
+	int m_maxVerticesPerFace;
+	int m_maxFacesPerShape;
+	int	m_maxConvexVertices;
+	int m_maxConvexIndices;
+	int m_maxConvexUniqueEdges;
+	
+	int	m_maxCompoundChildShapes;
+
+	btConfig()
+		:m_maxConvexBodies(128*1024),
+		m_maxConvexShapes(8192),
+		m_maxVerticesPerFace(64),
+		m_maxFacesPerShape(64),
+		m_maxConvexVertices(8192),
+		m_maxConvexIndices(8192),
+		m_maxConvexUniqueEdges(8192),
+		m_maxCompoundChildShapes(8192)//??
+	{
+		m_maxBroadphasePairs = 16*m_maxConvexBodies;
+	}
+};
+
+
+#endif//BT_CONFIG_H
+
diff --git a/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp
new file mode 100644
index 000000000..43068e086
--- /dev/null
+++ b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp
@@ -0,0 +1,504 @@
+#include "btGpuNarrowPhase.h"
+
+
+#include "parallel_primitives/host/btOpenCLArray.h"
+#include "../../gpu_sat/host/btConvexPolyhedronCL.h"
+#include "../../gpu_sat/host/ConvexHullContact.h"
+#include "../../gpu_broadphase/host/btSapAabb.h"
+#include <string.h>
+#include "btConfig.h"
+
+struct btGpuNarrowPhaseInternalData
+{
+	btAlignedObjectArray<btConvexUtility*>* m_convexData;
+    
+	btAlignedObjectArray<btConvexPolyhedronCL> m_convexPolyhedra;
+	btAlignedObjectArray<btVector3> m_uniqueEdges;
+	btAlignedObjectArray<btVector3> m_convexVertices;
+	btAlignedObjectArray<int> m_convexIndices;
+    
+	btOpenCLArray<btConvexPolyhedronCL>* m_convexPolyhedraGPU;
+	btOpenCLArray<btVector3>* m_uniqueEdgesGPU;
+	btOpenCLArray<btVector3>* m_convexVerticesGPU;
+	btOpenCLArray<int>* m_convexIndicesGPU;
+    
+    btOpenCLArray<btVector3>* m_worldVertsB1GPU;
+    btOpenCLArray<btInt4>* m_clippingFacesOutGPU;
+    btOpenCLArray<btVector3>* m_worldNormalsAGPU;
+    btOpenCLArray<btVector3>* m_worldVertsA1GPU;
+    btOpenCLArray<btVector3>* m_worldVertsB2GPU;
+    
+	btAlignedObjectArray<btGpuChildShape> m_cpuChildShapes;
+	btOpenCLArray<btGpuChildShape>*	m_gpuChildShapes;
+    
+	btAlignedObjectArray<btGpuFace> m_convexFaces;
+	btOpenCLArray<btGpuFace>* m_convexFacesGPU;
+    
+	GpuSatCollision*	m_gpuSatCollision;
+	    
+	btAlignedObjectArray<btInt2>* m_pBufPairsCPU;
+    
+	btOpenCLArray<btInt2>* m_convexPairsOutGPU;
+	btOpenCLArray<btInt2>* m_planePairs;
+    
+	btOpenCLArray<btContact4>* m_pBufContactOutGPU;
+	btAlignedObjectArray<btContact4>* m_pBufContactOutCPU;
+	
+    
+	btAlignedObjectArray<btRigidBodyCL>* m_bodyBufferCPU;
+	btOpenCLArray<btRigidBodyCL>* m_bodyBufferGPU;
+    
+	btAlignedObjectArray<btInertiaCL>*	m_inertiaBufferCPU;
+	btOpenCLArray<btInertiaCL>*	m_inertiaBufferGPU;
+    
+	int m_numAcceleratedShapes;
+	int m_numAcceleratedRigidBodies;
+    
+	btAlignedObjectArray<btCollidable>	m_collidablesCPU;
+	btOpenCLArray<btCollidable>*	m_collidablesGPU;
+
+	btOpenCLArray<btSapAabb>* m_localShapeAABBGPU;
+	btAlignedObjectArray<btSapAabb>* m_localShapeAABBCPU;
+
+	btConfig	m_config;
+    
+};
+
+
+
+
+
+btGpuNarrowPhase::btGpuNarrowPhase(cl_context ctx, cl_device_id device, cl_command_queue queue, const btConfig& config)
+:m_data(0) ,m_planeBodyIndex(-1),m_static0Index(-1),
+m_context(ctx),
+m_device(device),
+m_queue(queue)
+{
+    
+	m_data = new btGpuNarrowPhaseInternalData();
+	memset(m_data,0,sizeof(btGpuNarrowPhaseInternalData));
+    
+	m_data->m_config = config;
+	
+	m_data->m_gpuSatCollision = new GpuSatCollision(ctx,device,queue);
+	m_data->m_pBufPairsCPU = new btAlignedObjectArray<btInt2>;
+	m_data->m_pBufPairsCPU->resize(config.m_maxBroadphasePairs);
+	
+	m_data->m_convexPairsOutGPU = new btOpenCLArray<btInt2>(ctx,queue,config.m_maxBroadphasePairs,false);
+	m_data->m_planePairs = new btOpenCLArray<btInt2>(ctx,queue,config.m_maxBroadphasePairs,false);
+    
+	m_data->m_pBufContactOutCPU = new btAlignedObjectArray<btContact4>();
+	m_data->m_pBufContactOutCPU->resize(config.m_maxBroadphasePairs);
+	m_data->m_bodyBufferCPU = new btAlignedObjectArray<btRigidBodyCL>();
+	m_data->m_bodyBufferCPU->resize(config.m_maxConvexBodies);
+    
+	m_data->m_inertiaBufferCPU = new btAlignedObjectArray<btInertiaCL>();
+	m_data->m_inertiaBufferCPU->resize(config.m_maxConvexBodies);
+	
+	m_data->m_pBufContactOutGPU = new btOpenCLArray<btContact4>(ctx,queue, config.m_maxBroadphasePairs,true);
+	btContact4 test = m_data->m_pBufContactOutGPU->forcedAt(0);
+
+	m_data->m_inertiaBufferGPU = new btOpenCLArray<btInertiaCL>(ctx,queue,config.m_maxConvexBodies,false);
+	m_data->m_collidablesGPU = new btOpenCLArray<btCollidable>(ctx,queue,config.m_maxConvexShapes);
+
+	m_data->m_localShapeAABBCPU = new btAlignedObjectArray<btSapAabb>;
+	m_data->m_localShapeAABBGPU = new btOpenCLArray<btSapAabb>(ctx,queue,config.m_maxConvexShapes);
+    
+    
+	//m_data->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate(ctx,queue, config.m_maxBroadphasePairs,false);
+	m_data->m_bodyBufferGPU = new btOpenCLArray<btRigidBodyCL>(ctx,queue, config.m_maxConvexBodies,false);
+
+	m_data->m_convexFacesGPU = new btOpenCLArray<btGpuFace>(ctx,queue,config.m_maxConvexShapes*config.m_maxFacesPerShape,false);
+	m_data->m_gpuChildShapes = new btOpenCLArray<btGpuChildShape>(ctx,queue,config.m_maxCompoundChildShapes,false);
+	
+	m_data->m_convexPolyhedraGPU = new btOpenCLArray<btConvexPolyhedronCL>(ctx,queue,config.m_maxConvexShapes,false);
+	m_data->m_uniqueEdgesGPU = new btOpenCLArray<btVector3>(ctx,queue,config.m_maxConvexUniqueEdges,true);
+	m_data->m_convexVerticesGPU = new btOpenCLArray<btVector3>(ctx,queue,config.m_maxConvexVertices,true);
+	m_data->m_convexIndicesGPU = new btOpenCLArray<int>(ctx,queue,config.m_maxConvexIndices,true);
+    
+    
+	m_data->m_worldVertsB1GPU = new btOpenCLArray<btVector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
+    m_data->m_clippingFacesOutGPU = new  btOpenCLArray<btInt4>(ctx,queue,config.m_maxConvexBodies);
+    m_data->m_worldNormalsAGPU = new  btOpenCLArray<btVector3>(ctx,queue,config.m_maxConvexBodies);
+	m_data->m_worldVertsA1GPU = new btOpenCLArray<btVector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
+    m_data->m_worldVertsB2GPU = new  btOpenCLArray<btVector3>(ctx,queue,config.m_maxConvexBodies*config.m_maxVerticesPerFace);
+    
+    
+
+	m_data->m_convexData = new btAlignedObjectArray<btConvexUtility* >();
+    
+
+	m_data->m_convexData->resize(config.m_maxConvexShapes);
+	m_data->m_convexPolyhedra.resize(config.m_maxConvexShapes);
+    
+	m_data->m_numAcceleratedShapes = 0;
+	m_data->m_numAcceleratedRigidBodies = 0;
+    
+	//m_data->m_contactCGPU = new btOpenCLArray<Constraint4>(ctx,queue,config.m_maxBroadphasePairs,false);
+	//m_data->m_frictionCGPU = new btOpenCLArray<adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( m_data->m_deviceCL, config.m_maxBroadphasePairs);
+    
+}
+
+
+btGpuNarrowPhase::~btGpuNarrowPhase()
+{
+	delete m_data->m_gpuSatCollision;
+	delete m_data->m_pBufPairsCPU;
+	delete m_data->m_convexPairsOutGPU;
+	delete m_data->m_planePairs;
+	delete m_data->m_pBufContactOutCPU;
+	delete m_data->m_bodyBufferCPU;
+	delete m_data->m_inertiaBufferCPU;
+	delete m_data->m_pBufContactOutGPU;
+	delete m_data->m_inertiaBufferGPU;
+	delete m_data->m_collidablesGPU;
+	delete m_data->m_localShapeAABBCPU;
+	delete m_data->m_localShapeAABBGPU;
+	delete m_data->m_bodyBufferGPU;
+	delete m_data->m_convexFacesGPU;
+	delete m_data->m_gpuChildShapes;
+	delete m_data->m_convexPolyhedraGPU;
+	delete m_data->m_uniqueEdgesGPU;
+	delete m_data->m_convexVerticesGPU;
+	delete m_data->m_convexIndicesGPU;
+	delete m_data->m_worldVertsB1GPU;
+    delete m_data->m_clippingFacesOutGPU;
+    delete m_data->m_worldNormalsAGPU;
+	delete m_data->m_worldVertsA1GPU;
+    delete m_data->m_worldVertsB2GPU;
+    
+    
+    delete m_data->m_convexData;
+	delete m_data;
+}
+
+
+int	btGpuNarrowPhase::allocateCollidable()
+{
+	int curSize = m_data->m_collidablesCPU.size();
+	m_data->m_collidablesCPU.expand();
+	return curSize;
+}
+
+
+
+int btGpuNarrowPhase::registerConvexHullShape(btConvexUtility* convexPtr,btCollidable& col)
+{
+	m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1);
+	m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1);
+	
+    
+	btConvexPolyhedronCL& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1);
+	convex.mC = convexPtr->mC;
+	convex.mE = convexPtr->mE;
+	convex.m_extents= convexPtr->m_extents;
+	convex.m_localCenter = convexPtr->m_localCenter;
+	convex.m_radius = convexPtr->m_radius;
+	
+	convex.m_numUniqueEdges = convexPtr->m_uniqueEdges.size();
+	int edgeOffset = m_data->m_uniqueEdges.size();
+	convex.m_uniqueEdgesOffset = edgeOffset;
+	
+	m_data->m_uniqueEdges.resize(edgeOffset+convex.m_numUniqueEdges);
+    
+	//convex data here
+	int i;
+	for ( i=0;i<convexPtr->m_uniqueEdges.size();i++)
+	{
+		m_data->m_uniqueEdges[edgeOffset+i] = convexPtr->m_uniqueEdges[i];
+	}
+    
+	int faceOffset = m_data->m_convexFaces.size();
+	convex.m_faceOffset = faceOffset;
+	convex.m_numFaces = convexPtr->m_faces.size();
+	m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces);
+	for (i=0;i<convexPtr->m_faces.size();i++)
+	{
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[0] = convexPtr->m_faces[i].m_plane[0];
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[1] = convexPtr->m_faces[i].m_plane[1];
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[2] = convexPtr->m_faces[i].m_plane[2];
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[3] = convexPtr->m_faces[i].m_plane[3];
+		int indexOffset = m_data->m_convexIndices.size();
+		int numIndices = convexPtr->m_faces[i].m_indices.size();
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset;
+		m_data->m_convexIndices.resize(indexOffset+numIndices);
+		for (int p=0;p<numIndices;p++)
+		{
+			m_data->m_convexIndices[indexOffset+p] = convexPtr->m_faces[i].m_indices[p];
+		}
+	}
+    
+	convex.m_numVertices = convexPtr->m_vertices.size();
+	int vertexOffset = m_data->m_convexVertices.size();
+	convex.m_vertexOffset =vertexOffset;
+	m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices);
+	for (int i=0;i<convexPtr->m_vertices.size();i++)
+	{
+		m_data->m_convexVertices[vertexOffset+i] = convexPtr->m_vertices[i];
+	}
+
+	(*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = convexPtr;
+	
+	m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces);
+    
+	m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra);
+	m_data->m_uniqueEdgesGPU->copyFromHost(m_data->m_uniqueEdges);
+	m_data->m_convexVerticesGPU->copyFromHost(m_data->m_convexVertices);
+	m_data->m_convexIndicesGPU->copyFromHost(m_data->m_convexIndices);
+    
+    
+	return m_data->m_numAcceleratedShapes++;
+}
+
+
+int		btGpuNarrowPhase::registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling)
+{
+	btAlignedObjectArray<btVector3> verts;
+	
+	unsigned char* vts = (unsigned char*) vertices;
+	for (int i=0;i<numVertices;i++)
+	{
+		float* vertex = (float*) &vts[i*strideInBytes];
+		verts.push_back(btVector3(vertex[0]*scaling[0],vertex[1]*scaling[1],vertex[2]*scaling[2]));
+	}
+
+	btConvexUtility* utilPtr = new btConvexUtility();
+	bool merge = true;
+	if (numVertices)
+	{
+		utilPtr->initializePolyhedralFeatures(&verts[0],verts.size(),merge);
+	}
+
+	int collidableIndex = registerConvexHullShape(utilPtr);
+	return collidableIndex;
+}
+
+int		btGpuNarrowPhase::registerConvexHullShape(btConvexUtility* utilPtr)
+{
+	int collidableIndex = allocateCollidable();
+	btCollidable& col = getCollidableCpu(collidableIndex);
+	col.m_shapeType = SHAPE_CONVEX_HULL;
+	col.m_shapeIndex = -1;
+	
+	
+	{
+		btVector3 localCenter(0,0,0);
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+			localCenter+=utilPtr->m_vertices[i];
+		localCenter*= (1.f/utilPtr->m_vertices.size());
+		utilPtr->m_localCenter = localCenter;
+
+		col.m_shapeIndex = registerConvexHullShape(utilPtr,col);
+	}
+
+	if (col.m_shapeIndex>=0)
+	{
+		btSapAabb aabb;
+		
+		btVector3 myAabbMin(1e30f,1e30f,1e30f);
+		btVector3 myAabbMax(-1e30f,-1e30f,-1e30f);
+
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+		{
+			myAabbMin.setMin(utilPtr->m_vertices[i]);
+			myAabbMax.setMax(utilPtr->m_vertices[i]);
+		}
+		aabb.m_min[0] = myAabbMin[0];
+		aabb.m_min[1] = myAabbMin[1];
+		aabb.m_min[2] = myAabbMin[2];
+		aabb.m_minIndices[3] = 0;
+
+		aabb.m_max[0] = myAabbMax[0];
+		aabb.m_max[1] = myAabbMax[1];
+		aabb.m_max[2] = myAabbMax[2];
+		aabb.m_signedMaxIndices[3] = 0;
+
+		m_data->m_localShapeAABBCPU->push_back(aabb);
+		m_data->m_localShapeAABBGPU->push_back(aabb);
+	}
+	
+	return collidableIndex;
+
+}
+
+
+
+
+
+
+
+cl_mem	btGpuNarrowPhase::getBodiesGpu()
+{
+	return (cl_mem)m_data->m_bodyBufferGPU->getBufferCL();
+}
+
+
+int	btGpuNarrowPhase::getNumBodiesGpu() const
+{
+	return m_data->m_bodyBufferGPU->size();
+}
+
+cl_mem	btGpuNarrowPhase::getBodyInertiasGpu()
+{
+	return (cl_mem)m_data->m_inertiaBufferGPU->getBufferCL();
+}
+
+int	btGpuNarrowPhase::getNumBodyInertiasGpu() const
+{
+	return m_data->m_inertiaBufferGPU->size();
+}
+
+
+btCollidable& btGpuNarrowPhase::getCollidableCpu(int collidableIndex)
+{
+	return m_data->m_collidablesCPU[collidableIndex];
+}
+
+const btCollidable& btGpuNarrowPhase::getCollidableCpu(int collidableIndex) const
+{
+	return m_data->m_collidablesCPU[collidableIndex];
+}
+
+cl_mem btGpuNarrowPhase::getCollidablesGpu()
+{
+	return m_data->m_collidablesGPU->getBufferCL();
+}
+
+int	btGpuNarrowPhase::getNumCollidablesGpu() const
+{
+	return m_data->m_collidablesGPU->size();
+}
+
+
+
+
+
+int	btGpuNarrowPhase::getNumContactsGpu() const
+{
+	return m_data->m_pBufContactOutGPU->size();
+}
+cl_mem btGpuNarrowPhase::getContactsGpu()
+{
+	return m_data->m_pBufContactOutGPU->getBufferCL();
+}
+
+
+void btGpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbs, int numObjects)
+{
+
+}
+
+const btSapAabb& btGpuNarrowPhase::getLocalSpaceAabb(int collidableIndex) const
+{
+	return m_data->m_localShapeAABBCPU->at(collidableIndex);
+}
+
+
+
+
+
+int btGpuNarrowPhase::registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation , const float* aabbMinPtr, const float* aabbMaxPtr,bool writeToGpu)
+{
+	btVector3 aabbMin(aabbMinPtr[0],aabbMinPtr[1],aabbMinPtr[2]);
+	btVector3 aabbMax (aabbMaxPtr[0],aabbMaxPtr[1],aabbMaxPtr[2]);
+	
+	btAssert(m_data->m_numAcceleratedRigidBodies< (m_data->m_config.m_maxConvexBodies-1));
+    
+	m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies+1);
+    
+	btRigidBodyCL& body = m_data->m_bodyBufferCPU->at(m_data->m_numAcceleratedRigidBodies);
+    
+	float friction = 1.f;
+	float restitution = 0.f;
+    
+	body.m_frictionCoeff = friction;
+	body.m_restituitionCoeff = restitution;
+	body.m_angVel.setZero();
+	body.m_linVel.setValue(0,-1,0);//.setZero();
+	body.m_pos.setValue(position[0],position[1],position[2]);
+	body.m_quat.setValue(orientation[0],orientation[1],orientation[2],orientation[3]);
+	body.m_collidableIdx = collidableIndex;
+	if (collidableIndex>=0)
+	{
+//		body.m_shapeType = m_data->m_collidablesCPU.at(collidableIndex).m_shapeType;
+	} else
+	{
+	//	body.m_shapeType = CollisionShape::SHAPE_PLANE;
+		m_planeBodyIndex = m_data->m_numAcceleratedRigidBodies;
+	}
+	//body.m_shapeType = shapeType;
+	
+	
+	body.m_invMass = mass? 1.f/mass : 0.f;
+    
+	if (writeToGpu)
+	{
+		m_data->m_bodyBufferGPU->copyFromHostPointer(&body,1,m_data->m_numAcceleratedRigidBodies);
+	}
+    
+	btInertiaCL& shapeInfo = m_data->m_inertiaBufferCPU->at(m_data->m_numAcceleratedRigidBodies);
+    
+	if (mass==0.f)
+	{
+		if (m_data->m_numAcceleratedRigidBodies==0)
+			m_static0Index = 0;
+        
+		shapeInfo.m_initInvInertia.setValue(0,0,0,0,0,0,0,0,0);
+		shapeInfo.m_invInertiaWorld.setValue(0,0,0,0,0,0,0,0,0);
+	} else
+	{
+        
+		assert(body.m_collidableIdx>=0);
+        
+		//approximate using the aabb of the shape
+        
+		//Aabb aabb = (*m_data->m_shapePointers)[shapeIndex]->m_aabb;
+		btVector3 halfExtents = (aabbMax-aabbMin);//*0.5f;//fake larger inertia makes demos more stable ;-)
+        
+		btVector3 localInertia;
+        
+		float lx=2.f*halfExtents[0];
+		float ly=2.f*halfExtents[1];
+		float lz=2.f*halfExtents[2];
+        
+		localInertia.setValue( (mass/12.0f) * (ly*ly + lz*lz),
+                                   (mass/12.0f) * (lx*lx + lz*lz),
+                                   (mass/12.0f) * (lx*lx + ly*ly));
+        
+		btVector3 invLocalInertia;
+		invLocalInertia[0] = 1.f/localInertia[0];
+		invLocalInertia[1] = 1.f/localInertia[1];
+		invLocalInertia[2] = 1.f/localInertia[2];
+		invLocalInertia[3] = 0.f;
+        
+		shapeInfo.m_initInvInertia.setValue(
+			invLocalInertia[0],		0,						0,
+			0,						invLocalInertia[1],		0,
+			0,						0,						invLocalInertia[2]);
+
+		btMatrix3x3 m (body.m_quat);
+
+		shapeInfo.m_invInertiaWorld = m.scaled(invLocalInertia) * m.transpose();
+        
+	}
+    
+	if (writeToGpu)
+		m_data->m_inertiaBufferGPU->copyFromHostPointer(&shapeInfo,1,m_data->m_numAcceleratedRigidBodies);
+    
+    
+    
+	return m_data->m_numAcceleratedRigidBodies++;
+}
+
+void	btGpuNarrowPhase::writeAllBodiesToGpu()
+{
+	m_data->m_bodyBufferGPU->resize(m_data->m_numAcceleratedRigidBodies);
+	m_data->m_inertiaBufferGPU->resize(m_data->m_numAcceleratedRigidBodies);
+    
+	m_data->m_bodyBufferGPU->copyFromHostPointer(&m_data->m_bodyBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
+	m_data->m_inertiaBufferGPU->copyFromHostPointer(&m_data->m_inertiaBufferCPU->at(0),m_data->m_numAcceleratedRigidBodies);
+    
+	m_data->m_collidablesGPU->copyFromHost(m_data->m_collidablesCPU);
+    
+}
diff --git a/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h
new file mode 100644
index 000000000..face8b290
--- /dev/null
+++ b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h
@@ -0,0 +1,76 @@
+#ifndef BT_GPU_NARROWPHASE_H
+#define BT_GPU_NARROWPHASE_H
+
+#include "../../gpu_sat/host/btCollidable.h"
+#include "basic_initialize/btOpenCLInclude.h"
+#include "BulletCommon/btAlignedObjectArray.h"
+#include "BulletCommon/btVector3.h"
+
+class btGpuNarrowPhase
+{
+protected:
+
+	struct btGpuNarrowPhaseInternalData*	m_data;
+	int m_acceleratedCompanionShapeIndex;
+	int m_planeBodyIndex;
+	int	m_static0Index;
+
+	cl_context m_context;
+	cl_device_id m_device;
+	cl_command_queue m_queue;
+
+
+public:
+
+	
+
+
+	btGpuNarrowPhase(cl_context vtx, cl_device_id dev, cl_command_queue q, const struct btConfig& config);
+
+	virtual ~btGpuNarrowPhase(void);
+
+	
+	int registerCompoundShape(btAlignedObjectArray<btGpuChildShape>* childShapes);
+	int registerFace(const btVector3& faceNormal, float faceConstant);
+	int registerConcaveMeshShape(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices, btCollidable& col, const float* scaling);
+	int registerConcaveMeshShape(class objLoader* obj, btCollidable& col, const float* scaling);
+	
+	//do they need to be merged?
+	int registerConvexHullShape(class btConvexUtility* convexPtr, btCollidable& col);
+	int	registerConvexHullShape(btConvexUtility* utilPtr);
+	int	registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
+
+	int registerConvexHeightfield(class ConvexHeightField* convexShape,btCollidable& col);
+	int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
+	void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
+
+	void	writeAllBodiesToGpu();
+
+	void	readbackAllBodiesToCpu();
+	void	getObjectTransformFromCpu(float* position, float* orientation , int bodyIndex) const;
+
+	virtual void computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbs, int numObjects);
+	
+
+	cl_mem	getBodiesGpu();
+	int	getNumBodiesGpu() const;
+
+	cl_mem	getBodyInertiasGpu();
+	int	getNumBodyInertiasGpu() const;
+
+	cl_mem	getCollidablesGpu();
+	int		getNumCollidablesGpu() const;
+
+	cl_mem	getContactsGpu();
+	int	getNumContactsGpu() const;
+
+	int allocateCollidable();
+
+	btCollidable& getCollidableCpu(int collidableIndex);
+	const btCollidable& getCollidableCpu(int collidableIndex) const;
+
+	const struct btSapAabb& getLocalSpaceAabb(int collidableIndex) const;
+};
+
+#endif //BT_GPU_NARROWPHASE_H
+
diff --git a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp
new file mode 100644
index 000000000..3e43b6236
--- /dev/null
+++ b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp
@@ -0,0 +1,171 @@
+#include "btGpuRigidBodyPipeline.h"
+#include "btGpuRigidBodyPipelineInternalData.h"
+#include "../kernels/integrateKernel.h"
+#include "../../basic_initialize/btOpenCLUtils.h"
+#include "btGpuNarrowPhase.h"
+#include "BulletGeometry/btAabbUtil2.h"
+#include "../../gpu_broadphase/host/btSapAabb.h"
+#include "../../gpu_broadphase/host/btGpuSapBroadphase.h"
+#include "parallel_primitives/host/btLauncherCL.h"
+
+btGpuRigidBodyPipeline::btGpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue  q,class btGpuNarrowPhase* narrowphase, class btGpuSapBroadphase* broadphaseSap )
+{
+	m_data = new btGpuRigidBodyPipelineInternalData;
+	m_data->m_context = ctx;
+	m_data->m_device = device;
+	m_data->m_queue = q;
+
+	m_data->m_broadphaseSap = broadphaseSap;
+	m_data->m_narrowphase = narrowphase;
+
+	cl_int errNum=0;
+
+	{
+		cl_program prog = btOpenCLUtils::compileCLProgramFromString(m_data->m_context,m_data->m_device,integrateKernelCL,&errNum,"","opencl/gpu_rigidbody/kernels/integrateKernel.cl");
+		btAssert(errNum==CL_SUCCESS);
+		m_data->m_integrateTransformsKernel = btOpenCLUtils::compileCLKernelFromString(m_data->m_context, m_data->m_device,integrateKernelCL, "integrateTransformsKernel",&errNum,prog);
+		btAssert(errNum==CL_SUCCESS);
+		clReleaseProgram(prog);
+	}
+
+
+}
+
+btGpuRigidBodyPipeline::~btGpuRigidBodyPipeline()
+{
+	clReleaseKernel(m_data->m_integrateTransformsKernel);
+
+	delete m_data;
+}
+
+void	btGpuRigidBodyPipeline::stepSimulation(float deltaTime)
+{
+	btLauncherCL launcher(m_data->m_queue,m_data->m_integrateTransformsKernel);
+	//integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping)
+
+	launcher.setBuffer(m_data->m_narrowphase->getBodiesGpu());
+	int numBodies = m_data->m_narrowphase->getNumBodiesGpu();
+	launcher.setConst(numBodies);
+	float timeStep = 1./60.f;
+	launcher.setConst(timeStep);
+	float angularDamp = 0.99f;
+	launcher.setConst(angularDamp);
+	launcher.launch1D(numBodies);
+
+}
+
+
+cl_mem	btGpuRigidBodyPipeline::getBodyBuffer()
+{
+	return m_data->m_narrowphase->getBodiesGpu();
+}
+
+int	btGpuRigidBodyPipeline::getNumBodies() const
+{
+	return m_data->m_narrowphase->getNumBodiesGpu();
+}
+
+
+
+int		btGpuRigidBodyPipeline::registerConvexPolyhedron(btConvexUtility* utilPtr)
+{
+/*
+	int collidableIndex = m_narrowphaseAndSolver->allocateCollidable();
+
+	btCollidable& col = m_narrowphaseAndSolver->getCollidableCpu(collidableIndex);
+	col.m_shapeType = CollisionShape::SHAPE_CONVEX_HULL;
+	col.m_shapeIndex = -1;
+	
+
+	if (m_narrowphaseAndSolver)
+	{
+		btVector3 localCenter(0,0,0);
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+			localCenter+=utilPtr->m_vertices[i];
+		localCenter*= (1.f/utilPtr->m_vertices.size());
+		utilPtr->m_localCenter = localCenter;
+
+		col.m_shapeIndex = m_narrowphaseAndSolver->registerConvexHullShape(utilPtr,col);
+	}
+
+	if (col.m_shapeIndex>=0)
+	{
+		btAABBHost aabbMin, aabbMax;
+		btVector3 myAabbMin(1e30f,1e30f,1e30f);
+		btVector3 myAabbMax(-1e30f,-1e30f,-1e30f);
+
+		for (int i=0;i<utilPtr->m_vertices.size();i++)
+		{
+			myAabbMin.setMin(utilPtr->m_vertices[i]);
+			myAabbMax.setMax(utilPtr->m_vertices[i]);
+		}
+		aabbMin.fx = myAabbMin[0];//s_convexHeightField->m_aabb.m_min.x;
+		aabbMin.fy = myAabbMin[1];//s_convexHeightField->m_aabb.m_min.y;
+		aabbMin.fz= myAabbMin[2];//s_convexHeightField->m_aabb.m_min.z;
+		aabbMin.uw = 0;
+
+		aabbMax.fx = myAabbMax[0];//s_convexHeightField->m_aabb.m_max.x;
+		aabbMax.fy = myAabbMax[1];//s_convexHeightField->m_aabb.m_max.y;
+		aabbMax.fz= myAabbMax[2];//s_convexHeightField->m_aabb.m_max.z;
+		aabbMax.uw = 0;
+
+		m_data->m_localShapeAABBCPU->push_back(aabbMin);
+		m_data->m_localShapeAABBGPU->push_back(aabbMin);
+
+		m_data->m_localShapeAABBCPU->push_back(aabbMax);
+		m_data->m_localShapeAABBGPU->push_back(aabbMax);
+
+		//m_data->m_localShapeAABB->copyFromHostPointer(&aabbMin,1,shapeIndex*2);
+		//m_data->m_localShapeAABB->copyFromHostPointer(&aabbMax,1,shapeIndex*2+1);
+		clFinish(g_cqCommandQue);
+	}
+
+	delete[] eqn;
+	
+	
+	
+	return collidableIndex;
+	*/
+	return 0;
+}
+
+
+int		btGpuRigidBodyPipeline::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collidableIndex, int userIndex)
+{
+	btVector3 aabbMin(0,0,0),aabbMax(0,0,0);
+	if (collidableIndex>=0)
+	{
+		btSapAabb localAabb = m_data->m_narrowphase->getLocalSpaceAabb(collidableIndex);
+		btVector3 localAabbMin(localAabb.m_min[0],localAabb.m_min[1],localAabb.m_min[2]);
+		btVector3 localAabbMax(localAabb.m_max[0],localAabb.m_max[1],localAabb.m_max[2]);
+		
+		btScalar margin = 0.01f;
+		btTransform t;
+		t.setIdentity();
+		t.setOrigin(btVector3(position[0],position[1],position[2]));
+		t.setRotation(btQuaternion(orientation[0],orientation[1],orientation[2],orientation[3]));
+		btTransformAabb(localAabbMin,localAabbMax, margin,t,aabbMin,aabbMax);
+		if (mass)
+		{
+			m_data->m_broadphaseSap->createProxy(aabbMin,aabbMax,userIndex,1,1);//m_dispatcher);
+		} else
+		{
+			m_data->m_broadphaseSap->createLargeProxy(aabbMin,aabbMax,userIndex,1,1);//m_dispatcher);	
+		}
+	}
+			
+	bool writeToGpu = false;
+	int bodyIndex = -1;
+
+
+	bodyIndex = m_data->m_narrowphase->registerRigidBody(collidableIndex,mass,position,orientation,&aabbMin.getX(),&aabbMax.getX(),writeToGpu);
+
+	/*
+	if (mass>0.f)
+		m_numDynamicPhysicsInstances++;
+
+	m_numPhysicsInstances++;
+	*/
+
+	return bodyIndex;
+}
\ No newline at end of file
diff --git a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.h b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.h
new file mode 100644
index 000000000..1fae98a50
--- /dev/null
+++ b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.h
@@ -0,0 +1,38 @@
+#ifndef BT_GPU_RIGIDBODY_PIPELINE_H
+#define BT_GPU_RIGIDBODY_PIPELINE_H
+
+#include "../../basic_initialize/btOpenCLInclude.h"
+
+class btGpuRigidBodyPipeline
+{
+protected:
+	struct btGpuRigidBodyPipelineInternalData*	m_data;
+
+	int allocateCollidable();
+
+public:
+
+	btGpuRigidBodyPipeline(cl_context ctx,cl_device_id device, cl_command_queue  q , class btGpuNarrowPhase* narrowphase, class btGpuSapBroadphase* broadphaseSap);
+	virtual ~btGpuRigidBodyPipeline();
+
+	void	stepSimulation(float deltaTime);
+
+	int		registerConvexPolyhedron(class btConvexUtility* convex);
+
+	//int		registerConvexPolyhedron(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
+	//int		registerSphereShape(float radius);
+	//int		registerPlaneShape(const btVector3& planeNormal, float planeConstant);
+	
+	//int		registerConcaveMesh(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices, const float* scaling);
+	//int		registerCompoundShape(btAlignedObjectArray<btGpuChildShape>* childShapes);
+
+	
+	int		registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, int userData);
+
+	cl_mem	getBodyBuffer();
+
+	int	getNumBodies() const;
+
+};
+
+#endif //BT_GPU_RIGIDBODY_PIPELINE_H
\ No newline at end of file
diff --git a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h
new file mode 100644
index 000000000..93925cb1a
--- /dev/null
+++ b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipelineInternalData.h
@@ -0,0 +1,26 @@
+#ifndef BT_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
+#define BT_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
+
+#include "../../basic_initialize/btOpenCLInclude.h"
+#include "BulletCommon/btAlignedObjectArray.h"
+
+#include "../../parallel_primitives/host/btOpenCLArray.h"
+#include "../../gpu_sat/host/btCollidable.h"
+
+struct btGpuRigidBodyPipelineInternalData
+{
+
+	cl_context			m_context;
+	cl_device_id		m_device;
+	cl_command_queue	m_queue;
+
+	cl_kernel	m_integrateTransformsKernel;
+
+	class btGpuSapBroadphase* m_broadphaseSap;
+
+	class btGpuNarrowPhase*	m_narrowphase;
+	
+};
+
+#endif //BT_GPU_RIGIDBODY_PIPELINE_INTERNAL_DATA_H
+
diff --git a/opencl/gpu_rigidbody/kernels/integrateKernel.cl b/opencl/gpu_rigidbody/kernels/integrateKernel.cl
new file mode 100644
index 000000000..5b5bff268
--- /dev/null
+++ b/opencl/gpu_rigidbody/kernels/integrateKernel.cl
@@ -0,0 +1,89 @@
+
+float4 quatMult(float4 q1, float4 q2)
+{
+	float4 q;
+	q.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
+	q.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
+	q.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
+	q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; 
+	return q;
+}
+
+float4 quatNorm(float4 q)
+{
+	float len = native_sqrt(dot(q, q));
+	if(len > 0.f)
+	{
+		q *= 1.f / len;
+	}
+	else
+	{
+		q.x = q.y = q.z = 0.f;
+		q.w = 1.f;
+	}
+	return q;
+}
+
+
+typedef struct
+{
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+
+	unsigned int m_collidableIdx;
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;
+
+
+
+
+__kernel void 
+  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping)
+{
+	int nodeID = get_global_id(0);
+	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);
+	if( nodeID < numNodes )
+	{
+		if (1)
+		{
+			float4 axis;
+			//add some hardcoded angular damping
+			bodies[nodeID].m_angVel.x *= angularDamping;
+			bodies[nodeID].m_angVel.y *= angularDamping;
+			bodies[nodeID].m_angVel.z *= angularDamping;
+			
+			float4 angvel = bodies[nodeID].m_angVel;
+			float fAngle = native_sqrt(dot(angvel, angvel));
+			//limit the angular motion
+			if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)
+			{
+				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;
+			}
+			if(fAngle < 0.001f)
+			{
+				// use Taylor's expansions of sync function
+				axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);
+			}
+			else
+			{
+				// sync(fAngle) = sin(c*fAngle)/t
+				axis = angvel * ( native_sin(0.5f * fAngle * timeStep) / fAngle);
+			}
+			float4 dorn = axis;
+			dorn.w = native_cos(fAngle * timeStep * 0.5f);
+			float4 orn0 = bodies[nodeID].m_quat;
+
+			float4 predictedOrn = quatMult(dorn, orn0);
+			predictedOrn = quatNorm(predictedOrn);
+			bodies[nodeID].m_quat=predictedOrn;
+		}
+
+	//linear velocity		
+		bodies[nodeID].m_pos +=  bodies[nodeID].m_linVel * timeStep;
+		
+	}
+}
diff --git a/opencl/gpu_rigidbody/kernels/integrateKernel.h b/opencl/gpu_rigidbody/kernels/integrateKernel.h
new file mode 100644
index 000000000..6ace9b1aa
--- /dev/null
+++ b/opencl/gpu_rigidbody/kernels/integrateKernel.h
@@ -0,0 +1,93 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* integrateKernelCL= \
+"\n"
+"float4 quatMult(float4 q1, float4 q2)\n"
+"{\n"
+"	float4 q;\n"
+"	q.x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;\n"
+"	q.y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;\n"
+"	q.z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;\n"
+"	q.w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z; \n"
+"	return q;\n"
+"}\n"
+"\n"
+"float4 quatNorm(float4 q)\n"
+"{\n"
+"	float len = native_sqrt(dot(q, q));\n"
+"	if(len > 0.f)\n"
+"	{\n"
+"		q *= 1.f / len;\n"
+"	}\n"
+"	else\n"
+"	{\n"
+"		q.x = q.y = q.z = 0.f;\n"
+"		q.w = 1.f;\n"
+"	}\n"
+"	return q;\n"
+"}\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"\n"
+"	unsigned int m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} Body;\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void \n"
+"  integrateTransformsKernel( __global Body* bodies,const int numNodes, float timeStep, float angularDamping)\n"
+"{\n"
+"	int nodeID = get_global_id(0);\n"
+"	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);\n"
+"	if( nodeID < numNodes )\n"
+"	{\n"
+"		if (1)\n"
+"		{\n"
+"			float4 axis;\n"
+"			//add some hardcoded angular damping\n"
+"			bodies[nodeID].m_angVel.x *= angularDamping;\n"
+"			bodies[nodeID].m_angVel.y *= angularDamping;\n"
+"			bodies[nodeID].m_angVel.z *= angularDamping;\n"
+"			\n"
+"			float4 angvel = bodies[nodeID].m_angVel;\n"
+"			float fAngle = native_sqrt(dot(angvel, angvel));\n"
+"			//limit the angular motion\n"
+"			if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)\n"
+"			{\n"
+"				fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;\n"
+"			}\n"
+"			if(fAngle < 0.001f)\n"
+"			{\n"
+"				// use Taylor's expansions of sync function\n"
+"				axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);\n"
+"			}\n"
+"			else\n"
+"			{\n"
+"				// sync(fAngle) = sin(c*fAngle)/t\n"
+"				axis = angvel * ( native_sin(0.5f * fAngle * timeStep) / fAngle);\n"
+"			}\n"
+"			float4 dorn = axis;\n"
+"			dorn.w = native_cos(fAngle * timeStep * 0.5f);\n"
+"			float4 orn0 = bodies[nodeID].m_quat;\n"
+"\n"
+"			float4 predictedOrn = quatMult(dorn, orn0);\n"
+"			predictedOrn = quatNorm(predictedOrn);\n"
+"			bodies[nodeID].m_quat=predictedOrn;\n"
+"		}\n"
+"\n"
+"	//linear velocity		\n"
+"		bodies[nodeID].m_pos +=  bodies[nodeID].m_linVel * timeStep;\n"
+"		\n"
+"	}\n"
+"}\n"
+"\n"
+;
diff --git a/opencl/gpu_sat/host/btCollidable.h b/opencl/gpu_sat/host/btCollidable.h
index 6209671de..e9633ca42 100644
--- a/opencl/gpu_sat/host/btCollidable.h
+++ b/opencl/gpu_sat/host/btCollidable.h
@@ -2,6 +2,18 @@
 #ifndef BT_COLLIDABLE_H
 #define BT_COLLIDABLE_H
 
+enum btShapeTypes
+{
+	SHAPE_HEIGHT_FIELD=1,
+	SHAPE_CONVEX_HEIGHT_FIELD=2,
+	SHAPE_CONVEX_HULL=3,
+	SHAPE_PLANE=4,
+	SHAPE_CONCAVE_TRIMESH=5,
+	SHAPE_COMPOUND_OF_CONVEX_HULLS=6,
+	SHAPE_SPHERE=7,
+	MAX_NUM_SHAPE_TYPES,
+};
+
 struct btCollidable
 {
 	int m_numChildShapes;
diff --git a/opencl/gpu_sat/host/btRigidBodyCL.h b/opencl/gpu_sat/host/btRigidBodyCL.h
index 4a2c360f8..e91e4ad09 100644
--- a/opencl/gpu_sat/host/btRigidBodyCL.h
+++ b/opencl/gpu_sat/host/btRigidBodyCL.h
@@ -25,7 +25,7 @@ ATTRIBUTE_ALIGNED16(struct) btRigidBodyCL
 };
 
 
-struct Inertia
+struct btInertiaCL
 {
 	btMatrix3x3 m_invInertiaWorld;
 	btMatrix3x3 m_initInvInertia;