diff --git a/build/stringify.bat b/build/stringify.bat
index 5c45bd8a5..ba3526ee3 100644
--- a/build/stringify.bat
+++ b/build/stringify.bat
@@ -13,6 +13,8 @@ premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_broadphase/kerne
 
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/sat.cl" --headerfile="../opencl/gpu_sat/kernels/satKernels.h" --stringname="satKernelsCL" stringify
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/satClipHullContacts.cl" --headerfile="../opencl/gpu_sat/kernels/satClipHullContacts.h" --stringname="satClipKernelsCL" stringify
+premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_sat/kernels/bvhTraversal.cl" --headerfile="../opencl/gpu_sat/kernels/bvhTraversal.h" --stringname="bvhTraversalKernelCL" stringify
+
 
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/integrateKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/integrateKernel.h" --stringname="integrateKernelCL" stringify
 premake4 --file=stringifyKernel.lua --kernelfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.cl" --headerfile="../opencl/gpu_rigidbody/kernels/updateAabbsKernel.h" --stringname="updateAabbsKernelCL" stringify
diff --git a/demo/gpudemo/GpuDemo.h b/demo/gpudemo/GpuDemo.h
index 081632724..4c07e3d07 100644
--- a/demo/gpudemo/GpuDemo.h
+++ b/demo/gpudemo/GpuDemo.h
@@ -16,36 +16,36 @@ public:
 	typedef class GpuDemo* (CreateFunc)();
 
 	struct ConstructionInfo
-        {
-                bool useOpenCL;
-                int preferredOpenCLPlatformIndex;
-                int preferredOpenCLDeviceIndex;
-                int arraySizeX;
-                int arraySizeY;
-                int arraySizeZ;
-                bool m_useConcaveMesh;
-                float gapX;
-                float gapY;
-                float gapZ;
-                GLInstancingRenderer*   m_instancingRenderer;
-				class btgWindowInterface*	m_window;
+    {
+            bool useOpenCL;
+            int preferredOpenCLPlatformIndex;
+            int preferredOpenCLDeviceIndex;
+            int arraySizeX;
+            int arraySizeY;
+            int arraySizeZ;
+            bool m_useConcaveMesh;
+            float gapX;
+            float gapY;
+            float gapZ;
+            GLInstancingRenderer*   m_instancingRenderer;
+			class btgWindowInterface*	m_window;
 
-                ConstructionInfo()
-                        :useOpenCL(true),
-                        preferredOpenCLPlatformIndex(-1),
-                        preferredOpenCLDeviceIndex(-1),
-						arraySizeX(23),
-			arraySizeY(23 ),
-			arraySizeZ(23),
-			m_useConcaveMesh(false),
-			gapX(4.3),
-			gapY(2.0),
-			gapZ(4.3),
-                        m_instancingRenderer(0),
-						m_window(0)
-                {
-                }
-        };
+            ConstructionInfo()
+                    :useOpenCL(true),
+                    preferredOpenCLPlatformIndex(-1),
+                    preferredOpenCLDeviceIndex(-1),
+					arraySizeX(5),
+		arraySizeY(5 ),
+		arraySizeZ(5),
+		m_useConcaveMesh(false),
+		gapX(4.3),
+		gapY(2.0),
+		gapZ(4.3),
+                    m_instancingRenderer(0),
+					m_window(0)
+            {
+            }
+    };
 
 	GpuDemo();
 	virtual ~GpuDemo();
diff --git a/demo/gpudemo/ParticleDemo.cpp b/demo/gpudemo/ParticleDemo.cpp
index 0f2cad18d..a603dcfd9 100644
--- a/demo/gpudemo/ParticleDemo.cpp
+++ b/demo/gpudemo/ParticleDemo.cpp
@@ -380,7 +380,7 @@ void ParticleDemo::clientMoveAndDisplay()
 		{
 			btBufferInfoCL bInfo[] = { 
 				btBufferInfoCL( m_data->m_clPositionBuffer),
-				btBufferInfoCL( m_data->m_broadphaseGPU->getAabbBuffer()),
+				btBufferInfoCL( m_data->m_broadphaseGPU->getAabbBufferWS()),
 			};
 			
 			btLauncherCL launcher(m_clData->m_clQueue, m_data->m_updateAabbsKernel );
diff --git a/demo/gpudemo/broadphase/PairBench.cpp b/demo/gpudemo/broadphase/PairBench.cpp
index 50b4a9fb3..abaf4fc7c 100644
--- a/demo/gpudemo/broadphase/PairBench.cpp
+++ b/demo/gpudemo/broadphase/PairBench.cpp
@@ -301,7 +301,7 @@ void PairBench::clientMoveAndDisplay()
 		btLauncherCL launcher(m_clData->m_clQueue, m_data->m_updateAabbSimple);
 			launcher.setBuffer(m_data->m_instancePosOrnColor->getBufferCL() );
 			launcher.setConst( numObjects);
-			launcher.setBuffer(m_data->m_broadphaseGPU->getAabbBuffer());
+			launcher.setBuffer(m_data->m_broadphaseGPU->getAabbBufferWS());
 			launcher.launch1D( numObjects);
 			clFinish(m_clData->m_clQueue);
 		
diff --git a/demo/gpudemo/rigidbody/ConcaveScene.cpp b/demo/gpudemo/rigidbody/ConcaveScene.cpp
index 29fd244d1..3c99d6b3d 100644
--- a/demo/gpudemo/rigidbody/ConcaveScene.cpp
+++ b/demo/gpudemo/rigidbody/ConcaveScene.cpp
@@ -27,9 +27,9 @@ struct GraphicsVertex
 };
 struct GraphicsShape
 {
-	const float*	m_vertices;
+	btAlignedObjectArray<GraphicsVertex>*	m_vertices;
 	int				m_numvertices;
-	const int*		m_indices;
+	btAlignedObjectArray<int>* 		m_indices;
 	int				m_numIndices;
 	float			m_scaling[4];
 };
@@ -153,9 +153,9 @@ GraphicsShape* createGraphicsShapeFromWavefrontObj(objLoader* obj)
 		
 		
 		GraphicsShape* gfxShape = new GraphicsShape;
-		gfxShape->m_vertices = &vertices->at(0).xyzw[0];
+		gfxShape->m_vertices = vertices;
 		gfxShape->m_numvertices = vertices->size();
-		gfxShape->m_indices = &indicesPtr->at(0);
+		gfxShape->m_indices = indicesPtr;
 		gfxShape->m_numIndices = indicesPtr->size();
 		for (int i=0;i<4;i++)
 			gfxShape->m_scaling[i] = 1;//bake the scaling into the vertices 
@@ -167,8 +167,8 @@ void ConcaveScene::setupScene(const ConstructionInfo& ci)
 {
 	objLoader* objData = new objLoader();
 	//char* fileName = "data/plane.obj";
-	//char* fileName = "data/teddy.obj";//"plane.obj";
-	char* fileName = "data/sponza_closed.obj";//"plane.obj";
+	char* fileName = "data/teddy.obj";//"plane.obj";
+	//char* fileName = "data/sponza_closed.obj";//"plane.obj";
 
 	FILE* f = 0;
 
@@ -196,29 +196,46 @@ void ConcaveScene::setupScene(const ConstructionInfo& ci)
 		return;
 
 	objData->load(relativeFileName);
-	
-	GraphicsShape* shape = createGraphicsShapeFromWavefrontObj(objData);
+	int index=10;
+
 	{
-		int strideInBytes = 9*sizeof(float);
-		int numVertices = sizeof(cube_vertices)/strideInBytes;
-		int numIndices = sizeof(cube_vertices)/sizeof(int);
-		//int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
-		//int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
-
-
-		int shapeId = ci.m_instancingRenderer->registerShape(shape->m_vertices, shape->m_numvertices, shape->m_indices, shape->m_numIndices);
-		btQuaternion orn(0,0,0,1);
-				
-		btVector4 color(0,1,0,1.f);//0.5);
+		GraphicsShape* shape = createGraphicsShapeFromWavefrontObj(objData);
 		btVector4 scaling(1,1,1,1);
-				
+
+		btAlignedObjectArray<btVector3> verts;
+		for (int i=0;i<shape->m_numvertices;i++)
 		{
-			btVector3 position(0,0,0);
-			int id = ci.m_instancingRenderer->registerGraphicsInstance(shapeId,position,orn,color,scaling);
+			btVector3 vtx = (btVector3&)shape->m_vertices->at(i).xyzw;
+			verts.push_back(vtx);
 		}
+	
+		int colIndex = m_data->m_np->registerConcaveMesh(&verts,shape->m_indices,scaling);
+		
+		{
+			int strideInBytes = 9*sizeof(float);
+			int numVertices = sizeof(cube_vertices)/strideInBytes;
+			int numIndices = sizeof(cube_vertices)/sizeof(int);
+			//int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
+			//int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
+
+
+			int shapeId = ci.m_instancingRenderer->registerShape(&shape->m_vertices->at(0).xyzw[0], shape->m_numvertices, &shape->m_indices->at(0), shape->m_numIndices);
+			btQuaternion orn(0,0,0,1);
+				
+			btVector4 color(0,1,0,1.f);//0.5);
+	
+				
+			{
+				float mass = 0.f;
+				btVector3 position(0,0,0);
+				int id = ci.m_instancingRenderer->registerGraphicsInstance(shapeId,position,orn,color,scaling);
+				int pid = m_data->m_rigidBodyPipeline->registerPhysicsInstance(mass,position,orn,colIndex,index);
+				index++;
+			}
 
 		
 
+		}
 	}
 
 	int strideInBytes = 9*sizeof(float);
@@ -228,37 +245,39 @@ void ConcaveScene::setupScene(const ConstructionInfo& ci)
 	int shapeId = ci.m_instancingRenderer->registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
 	int group=1;
 	int mask=1;
-	int index=10;
-	float scaling[4] = {1,1,1,1};
+	
+	
 
-	int colIndex = m_data->m_np->registerConvexHullShape(&cube_vertices[0],strideInBytes,numVertices, scaling);
+	
 
-	if (0)
+	if (1)
 	{
-	for (int i=0;i<1;i++)
-	{
-		for (int j=0;j<ci.arraySizeY;j++)
+		btVector4 scaling(1,1,1,1);
+		int colIndex = m_data->m_np->registerConvexHullShape(&cube_vertices[0],strideInBytes,numVertices, scaling);
+		for (int i=0;i<ci.arraySizeX;i++)
 		{
-			for (int k=0;k<1;k++)
+			for (int j=0;j<ci.arraySizeY;j++)
 			{
-				float mass = j==0? 0.f : 1.f;
+				for (int k=0;k<ci.arraySizeZ;k++)
+				{
+					float mass = 1;
 
-				btVector3 position(40+i*ci.gapX,j*ci.gapY,k*ci.gapZ);
-				btQuaternion orn(1,0,0,0);
+					btVector3 position(-2*ci.gapX+i*ci.gapX,25+j*ci.gapY,-2*ci.gapZ+k*ci.gapZ);
+					btQuaternion orn(1,0,0,0);
 				
-				btVector4 color(0,1,0,1);
-				btVector4 scaling(1,1,1,1);
-				int id = ci.m_instancingRenderer->registerGraphicsInstance(shapeId,position,orn,color,scaling);
-				int pid = m_data->m_rigidBodyPipeline->registerPhysicsInstance(mass,position,orn,colIndex,index);
+					btVector4 color(0,1,0,1);
 				
-				index++;
+					int id = ci.m_instancingRenderer->registerGraphicsInstance(shapeId,position,orn,color,scaling);
+					int pid = m_data->m_rigidBodyPipeline->registerPhysicsInstance(mass,position,orn,colIndex,index);
+				
+					index++;
+				}
 			}
 		}
 	}
-	}
 	float camPos[4]={0,0,0,0};//65.5,4.5,65.5,0};
 	//float camPos[4]={1,12.5,1.5,0};
 	m_instancingRenderer->setCameraTargetPosition(camPos);
-	m_instancingRenderer->setCameraDistance(10);
+	m_instancingRenderer->setCameraDistance(50);
 
 }
\ No newline at end of file
diff --git a/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp b/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
index f3e926798..900445597 100644
--- a/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
+++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
@@ -472,7 +472,7 @@ void btGpuSapBroadphase::createProxy(const btVector3& aabbMin,  const btVector3&
 	m_allAabbsCPU.push_back(aabb);
 }
 
-cl_mem	btGpuSapBroadphase::getAabbBuffer()
+cl_mem	btGpuSapBroadphase::getAabbBufferWS()
 {
 	return m_allAabbsGPU.getBufferCL();
 }
diff --git a/opencl/gpu_broadphase/host/btGpuSapBroadphase.h b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
index c119b8ce7..c1ccdcd6e 100644
--- a/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
+++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
@@ -53,7 +53,7 @@ class btGpuSapBroadphase
 	//call writeAabbsToGpu after done making all changes (createProxy etc)
 	void writeAabbsToGpu();
 
-	cl_mem	getAabbBuffer();
+	cl_mem	getAabbBufferWS();
 	int	getNumOverlap();
 	cl_mem	getOverlappingPairBuffer();
 };
diff --git a/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp
index e9595bc0c..beec30c51 100644
--- a/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp
+++ b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.cpp
@@ -7,6 +7,8 @@
 #include "../../gpu_broadphase/host/btSapAabb.h"
 #include <string.h>
 #include "btConfig.h"
+#include "../../gpu_sat/host/btOptimizedBvh.h"
+#include "../../gpu_sat/host/btTriangleIndexVertexArray.h"
 
 struct btGpuNarrowPhaseInternalData
 {
@@ -60,6 +62,7 @@ struct btGpuNarrowPhaseInternalData
 	btOpenCLArray<btSapAabb>* m_localShapeAABBGPU;
 	btAlignedObjectArray<btSapAabb>* m_localShapeAABBCPU;
 
+	btAlignedObjectArray<class btOptimizedBvh*> m_bvhData;
 	btConfig	m_config;
     
 };
@@ -323,8 +326,137 @@ int		btGpuNarrowPhase::registerConvexHullShape(btConvexUtility* utilPtr)
 }
 
 
+int		btGpuNarrowPhase::registerConcaveMesh(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices,const float* scaling1)
+{
+	btVector3 scaling(scaling1[0],scaling1[1],scaling1[2]);
+
+	int collidableIndex = allocateCollidable();
+	btCollidable& col = getCollidableCpu(collidableIndex);
+	
+	col.m_shapeType = SHAPE_CONCAVE_TRIMESH;
+	col.m_shapeIndex = registerConcaveMeshShape(vertices,indices,col,scaling);
+
+	
+
+	btSapAabb aabb;
+	btVector3 myAabbMin(1e30f,1e30f,1e30f);
+	btVector3 myAabbMax(-1e30f,-1e30f,-1e30f);
+
+	for (int i=0;i<vertices->size();i++)
+	{
+		btVector3 vtx(vertices->at(i)*scaling);
+		myAabbMin.setMin(vtx);
+		myAabbMax.setMax(vtx);
+	}
+	aabb.m_min[0] = myAabbMin[0];
+	aabb.m_min[1] = myAabbMin[1];
+	aabb.m_min[2] = myAabbMin[2];
+	aabb.m_minIndices[3] = 0;
+
+	aabb.m_max[0] = myAabbMax[0];
+	aabb.m_max[1]= myAabbMax[1];
+	aabb.m_max[2]= myAabbMax[2];
+	aabb.m_signedMaxIndices[3]= 0;
+
+	m_data->m_localShapeAABBCPU->push_back(aabb);
+	m_data->m_localShapeAABBGPU->push_back(aabb);
+
+	btOptimizedBvh* bvh = new btOptimizedBvh();
+	//void btOptimizedBvh::build(btStridingMeshInterface* triangles, bool useQuantizedAabbCompression, const btVector3& bvhAabbMin, const btVector3& bvhAabbMax)
+	
+	bool useQuantizedAabbCompression = true;
+	btTriangleIndexVertexArray* meshInterface=new btTriangleIndexVertexArray();
+	btIndexedMesh mesh;
+	mesh.m_numTriangles = indices->size()/3;
+	mesh.m_numVertices = vertices->size();
+	mesh.m_vertexBase = (const unsigned char *)&vertices->at(0).getX();
+	mesh.m_vertexStride = sizeof(btVector3);
+	mesh.m_triangleIndexStride = 3 * sizeof(int);// or sizeof(int)
+	mesh.m_triangleIndexBase = (const unsigned char *)&indices->at(0);
+	
+	meshInterface->addIndexedMesh(mesh);
+	bvh->build(meshInterface, useQuantizedAabbCompression, (btVector3&)aabb.m_min, (btVector3&)aabb.m_max);
+	m_data->m_bvhData.push_back(bvh);
+
+	return collidableIndex;
+}
+
+int btGpuNarrowPhase::registerConcaveMeshShape(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices,btCollidable& col, const float* scaling1)
+{
 
 
+	btVector3 scaling(scaling1[0],scaling1[1],scaling1[2]);
+
+	m_data->m_convexData->resize(m_data->m_numAcceleratedShapes+1);
+	m_data->m_convexPolyhedra.resize(m_data->m_numAcceleratedShapes+1);
+	
+    
+	btConvexPolyhedronCL& convex = m_data->m_convexPolyhedra.at(m_data->m_convexPolyhedra.size()-1);
+	convex.mC = btVector3(0,0,0);
+	convex.mE = btVector3(0,0,0);
+	convex.m_extents= btVector3(0,0,0);
+	convex.m_localCenter = btVector3(0,0,0);
+	convex.m_radius = 0.f;
+	
+	convex.m_numUniqueEdges = 0;
+	int edgeOffset = m_data->m_uniqueEdges.size();
+	convex.m_uniqueEdgesOffset = edgeOffset;
+	
+	int faceOffset = m_data->m_convexFaces.size();
+	convex.m_faceOffset = faceOffset;
+	
+	convex.m_numFaces = indices->size()/3;
+	m_data->m_convexFaces.resize(faceOffset+convex.m_numFaces);
+	m_data->m_convexIndices.reserve(convex.m_numFaces*3);
+	for (int i=0;i<convex.m_numFaces;i++)
+	{
+		if (i%256==0)
+		{
+		printf("i=%d out of %d", i,convex.m_numFaces);
+		}
+		btVector3 vert0(vertices->at(indices->at(i*3))*scaling);
+		btVector3 vert1(vertices->at(indices->at(i*3+1))*scaling);
+		btVector3 vert2(vertices->at(indices->at(i*3+2))*scaling);
+
+		btVector3 normal = ((vert1-vert0).cross(vert2-vert0)).normalize();
+		btScalar c = -(normal.dot(vert0));
+
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[0] = normal.x();
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[1] = normal.y();
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[2] = normal.z();
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_plane[3] = c;
+		int indexOffset = m_data->m_convexIndices.size();
+		int numIndices = 3;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_numIndices = numIndices;
+		m_data->m_convexFaces[convex.m_faceOffset+i].m_indexOffset = indexOffset;
+		m_data->m_convexIndices.resize(indexOffset+numIndices);
+		for (int p=0;p<numIndices;p++)
+		{
+			int vi = indices->at(i*3+p);
+			m_data->m_convexIndices[indexOffset+p] = vi;//convexPtr->m_faces[i].m_indices[p];
+		}
+	}
+    
+	convex.m_numVertices = vertices->size();
+	int vertexOffset = m_data->m_convexVertices.size();
+	convex.m_vertexOffset =vertexOffset;
+	m_data->m_convexVertices.resize(vertexOffset+convex.m_numVertices);
+	for (int i=0;i<vertices->size();i++)
+	{
+		m_data->m_convexVertices[vertexOffset+i] = vertices->at(i)*scaling;
+	}
+
+	(*m_data->m_convexData)[m_data->m_numAcceleratedShapes] = 0;
+	
+	m_data->m_convexFacesGPU->copyFromHost(m_data->m_convexFaces);
+    
+	m_data->m_convexPolyhedraGPU->copyFromHost(m_data->m_convexPolyhedra);
+	m_data->m_uniqueEdgesGPU->copyFromHost(m_data->m_uniqueEdges);
+	m_data->m_convexVerticesGPU->copyFromHost(m_data->m_convexVertices);
+	m_data->m_convexIndicesGPU->copyFromHost(m_data->m_convexIndices);
+  
+	return m_data->m_numAcceleratedShapes++;
+}
 
 
 
@@ -388,7 +520,7 @@ cl_mem btGpuNarrowPhase::getContactsGpu()
 }
 
 
-void btGpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbs, int numObjects)
+void btGpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphasePairs, cl_mem aabbsWS, int numObjects)
 {
 	int nContactOut = 0;
 
@@ -399,7 +531,7 @@ void btGpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphase
 	btOpenCLArray<btInt2> broadphasePairsGPU(m_context,m_queue);
 	broadphasePairsGPU.setFromOpenCLBuffer(broadphasePairs,numBroadphasePairs);
 	btOpenCLArray<btYetAnotherAabb> clAabbArray(this->m_context,this->m_queue);
-	clAabbArray.setFromOpenCLBuffer(aabbs,numObjects);
+	clAabbArray.setFromOpenCLBuffer(aabbsWS,numObjects);
 
 	m_data->m_gpuSatCollision->computeConvexConvexContactsGPUSAT(
 		&broadphasePairsGPU, numBroadphasePairs,
@@ -419,7 +551,7 @@ void btGpuNarrowPhase::computeContacts(cl_mem broadphasePairs, int numBroadphase
 		*m_data->m_worldNormalsAGPU,
 		*m_data->m_worldVertsA1GPU,
 		*m_data->m_worldVertsB2GPU,
-
+		m_data->m_bvhData,
 		numObjects,
 		maxTriConvexPairCapacity,
 		triangleConvexPairs,
diff --git a/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h
index ca4381a6e..50d030c57 100644
--- a/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h
+++ b/opencl/gpu_rigidbody/host/btGpuNarrowPhase.h
@@ -19,6 +19,8 @@ protected:
 	cl_device_id m_device;
 	cl_command_queue m_queue;
 
+	int registerConvexHullShape(class btConvexUtility* convexPtr, btCollidable& col);
+	int registerConcaveMeshShape(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices, btCollidable& col, const float* scaling);
 
 public:
 
@@ -32,15 +34,14 @@ public:
 	
 	int registerCompoundShape(btAlignedObjectArray<btGpuChildShape>* childShapes);
 	int registerFace(const btVector3& faceNormal, float faceConstant);
-	int registerConcaveMeshShape(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices, btCollidable& col, const float* scaling);
-	int registerConcaveMeshShape(class objLoader* obj, btCollidable& col, const float* scaling);
+	
+	int	registerConcaveMesh(btAlignedObjectArray<btVector3>* vertices, btAlignedObjectArray<int>* indices,const float* scaling);
 	
 	//do they need to be merged?
-	int registerConvexHullShape(class btConvexUtility* convexPtr, btCollidable& col);
+	
 	int	registerConvexHullShape(btConvexUtility* utilPtr);
 	int	registerConvexHullShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
 
-	int registerConvexHeightfield(class ConvexHeightField* convexShape,btCollidable& col);
 	int registerRigidBody(int collidableIndex, float mass, const float* position, const float* orientation, const float* aabbMin, const float* aabbMax,bool writeToGpu);
 	void setObjectTransform(const float* position, const float* orientation , int bodyIndex);
 
diff --git a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp
index 704942d23..f73079a9b 100644
--- a/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp
+++ b/opencl/gpu_rigidbody/host/btGpuRigidBodyPipeline.cpp
@@ -78,10 +78,10 @@ void	btGpuRigidBodyPipeline::stepSimulation(float deltaTime)
 	if (numPairs)
 	{
 		cl_mem pairs = m_data->m_broadphaseSap->getOverlappingPairBuffer();
-		cl_mem aabbs = m_data->m_broadphaseSap->getAabbBuffer();
+		cl_mem aabbsWS = m_data->m_broadphaseSap->getAabbBufferWS();
 		
 
-		m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbs,numBodies);
+		m_data->m_narrowphase->computeContacts(pairs,numPairs,aabbsWS,numBodies);
 		numContacts = m_data->m_narrowphase->getNumContactsGpu();
 		//if (numContacts)
 		//	printf("numContacts = %d\n", numContacts);
@@ -169,7 +169,7 @@ void	btGpuRigidBodyPipeline::setupGpuAabbsFull()
 	launcher.setBuffer(collidables);
 	cl_mem localAabbs = m_data->m_narrowphase->getAabbBufferGpu();
 	launcher.setBuffer(localAabbs);
-	cl_mem worldAabbs = m_data->m_broadphaseSap->getAabbBuffer();
+	cl_mem worldAabbs = m_data->m_broadphaseSap->getAabbBufferWS();
 	launcher.setBuffer(worldAabbs);
 	launcher.launch1D(numBodies);
 	oclCHECKERROR(ciErrNum, CL_SUCCESS);
diff --git a/opencl/gpu_sat/host/ConvexHullContact.cpp b/opencl/gpu_sat/host/ConvexHullContact.cpp
index 0987e3486..d527a9c36 100644
--- a/opencl/gpu_sat/host/ConvexHullContact.cpp
+++ b/opencl/gpu_sat/host/ConvexHullContact.cpp
@@ -23,6 +23,7 @@ subject to the following restrictions:
 #include "ConvexHullContact.h"
 #include <string.h>//memcpy
 #include "btConvexPolyhedronCL.h"
+#include "btOptimizedBvh.h"
 
 typedef btAlignedObjectArray<btVector3> btVertexArray;
 #include "BulletCommon/btQuickprof.h"
@@ -34,8 +35,11 @@ typedef btAlignedObjectArray<btVector3> btVertexArray;
 
 #include "../kernels/satKernels.h"
 #include "../kernels/satClipHullContacts.h"
+#include "../kernels/bvhTraversal.h"
+
 #include "BulletGeometry/btAabbUtil2.h"
 
+
 #define dot3F4 btDot
 
 GpuSatCollision::GpuSatCollision(cl_context ctx,cl_device_id device, cl_command_queue  q )
@@ -92,11 +96,8 @@ m_totalContactsOut(m_context, m_queue)
         m_newContactReductionKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcClip,
                             "newContactReductionKernel",&errNum,satClipContactsProg);
 		btAssert(errNum==CL_SUCCESS);
-
-        
-        
-
-	} else
+	}
+   else
 	{
 		m_clipHullHullKernel=0;
 		m_clipCompoundsHullHullKernel = 0;
@@ -106,12 +107,25 @@ m_totalContactsOut(m_context, m_queue)
 		m_clipHullHullConcaveConvexKernel = 0;
 		m_extractManifoldAndAddContactKernel = 0;
 	}
+
+	 if (1)
+	{
+		const char* srcBvh = bvhTraversalKernelCL;
+		cl_program bvhTraversalProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,srcBvh,&errNum,"","opencl/gpu_sat/kernels/bvhTraversal.cl");
+		btAssert(errNum==CL_SUCCESS);
+
+		m_bvhTraversalKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,srcBvh, "bvhTraversalKernel",&errNum,bvhTraversalProg);
+		btAssert(errNum==CL_SUCCESS);
+
+	}
+        
 	
 
 }
 
 GpuSatCollision::~GpuSatCollision()
 {
+	
 	if (m_findSeparatingAxisKernel)
 		clReleaseKernel(m_findSeparatingAxisKernel);
 
@@ -138,9 +152,23 @@ GpuSatCollision::~GpuSatCollision()
 		clReleaseKernel(m_clipHullHullConcaveConvexKernel);
 	if (m_extractManifoldAndAddContactKernel)
 		clReleaseKernel(m_extractManifoldAndAddContactKernel);
+
+	if (m_bvhTraversalKernel)
+		clReleaseKernel(m_bvhTraversalKernel);
+
 }
 
+struct MyTriangleCallback : public btNodeOverlapCallback
+{
+	int m_bodyIndexA;
+	int m_bodyIndexB;
 
+	virtual void processNode(int subPart, int triangleIndex)
+	{
+		printf("bodyIndexA %d, bodyIndexB %d\n",m_bodyIndexA,m_bodyIndexB);
+		printf("triangleIndex %d\n", triangleIndex);
+	}
+};
 
 void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btInt2>* pairs, int nPairs,
 			const btOpenCLArray<btRigidBodyCL>* bodyBuf,
@@ -154,12 +182,13 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btI
 			const btOpenCLArray<btCollidable>& gpuCollidables,
 			const btOpenCLArray<btGpuChildShape>& gpuChildShapes,
 
-			const btOpenCLArray<btYetAnotherAabb>& clAabbs,
+			const btOpenCLArray<btYetAnotherAabb>& clAabbsWS,
             btOpenCLArray<btVector3>& worldVertsB1GPU,
             btOpenCLArray<btInt4>& clippingFacesOutGPU,
             btOpenCLArray<btVector3>& worldNormalsAGPU,
             btOpenCLArray<btVector3>& worldVertsA1GPU,
-            btOpenCLArray<btVector3>& worldVertsB2GPU,     
+            btOpenCLArray<btVector3>& worldVertsB2GPU,    
+			btAlignedObjectArray<class btOptimizedBvh*>& bvhData,
 			int numObjects,
 			int maxTriConvexPairCapacity,
 			btOpenCLArray<btInt4>& triangleConvexPairsOut,
@@ -219,7 +248,7 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btI
 				btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
 				btBufferInfoCL( gpuFaces.getBufferCL(),true),
 				btBufferInfoCL( gpuIndices.getBufferCL(),true),
-				btBufferInfoCL( clAabbs.getBufferCL(),true),
+				btBufferInfoCL( clAabbsWS.getBufferCL(),true),
 				btBufferInfoCL( sepNormals.getBufferCL()),
 				btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
 				btBufferInfoCL( triangleConvexPairsOut.getBufferCL()),
@@ -237,9 +266,119 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btI
 			clFinish(m_queue);
 
 			numConcave = numConcavePairsOut.at(0);
-			if (numConcave > maxTriConvexPairCapacity)
-				numConcave = maxTriConvexPairCapacity;
-			triangleConvexPairsOut.resize(numConcave);
+			if (numConcave)
+			{
+				if (numConcave > maxTriConvexPairCapacity)
+					numConcave = maxTriConvexPairCapacity;
+
+				triangleConvexPairsOut.resize(numConcave);
+				btAlignedObjectArray<btInt4> triangleConvexPairsOutCPU;
+				triangleConvexPairsOut.copyToHost(triangleConvexPairsOutCPU);
+				printf("-----------------------\n", numConcave);
+				printf("got %d concave pairs\n", numConcave);
+				btAssert(numConcave = triangleConvexPairsOutCPU.size());
+
+				for (int i=0;i<triangleConvexPairsOutCPU.size();i++)
+				{
+					printf("bodyIndexA = %d, bodyIndexB = %d\n", triangleConvexPairsOutCPU[i].x,triangleConvexPairsOutCPU[i].y);
+					printf("triangleIndex = %d\n", triangleConvexPairsOutCPU[i].z);
+				}
+				printf("-----------------------\n", numConcave);
+				printf("Now using BVH query\n" );
+				btAlignedObjectArray<btCollidable> collidablesCPU;
+				gpuCollidables.copyToHost(collidablesCPU);
+				btAlignedObjectArray<btRigidBodyCL> bodiesCPU;
+				bodyBuf->copyToHost(bodiesCPU);
+				btAlignedObjectArray<btInt2> pairsCPU;
+
+				btAlignedObjectArray<btYetAnotherAabb> aabbsWSCPU;
+				clAabbsWS.copyToHost(aabbsWSCPU);
+
+				pairs->copyToHost(pairsCPU);
+				MyTriangleCallback triCallback;
+				
+
+				for (int i=0;i<pairsCPU.size();i++)
+				{
+					int bodyIndexA = pairsCPU[i].x;
+					int bodyIndexB = pairsCPU[i].y;
+
+					triCallback.m_bodyIndexA = bodyIndexA;
+					triCallback.m_bodyIndexB = bodyIndexB;
+
+					int collidableIndexA = bodiesCPU[bodyIndexA].m_collidableIdx;
+					int collidableIndexB = bodiesCPU[bodyIndexB].m_collidableIdx;
+
+					if (collidablesCPU[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH)
+					{
+						//check aabbWS for bodyB against optimized BVH
+						btVector3 aabbMin = (const btVector3&)aabbsWSCPU[bodyIndexB].m_min[0];
+						aabbMin[3] = 0.f;
+						btVector3 aabbMax = (const btVector3&)aabbsWSCPU[bodyIndexB].m_max[0];
+						aabbMax[3] = 0.f;
+						bvhData[0]->reportAabbOverlappingNodex(&triCallback, aabbMin,aabbMax);
+					}
+				}
+
+				//now perform the tree query on GPU
+
+				int numNodes = bvhData[0]->getLeafNodeArray().size();
+				btOpenCLArray<btQuantizedBvhNode>	treeNodesGPU(this->m_context,this->m_queue,numNodes);
+				treeNodesGPU.copyFromHost(bvhData[0]->getQuantizedNodeArray());
+				int numSubTrees = bvhData[0]->getSubtreeInfoArray().size();
+				btOpenCLArray<btBvhSubtreeInfo>	subTreesGPU(this->m_context,this->m_queue,numSubTrees);
+				subTreesGPU.copyFromHost(bvhData[0]->getSubtreeInfoArray());
+
+
+				/*
+				__kernel void   bvhTraversalKernel( __global const int2* pairs, 
+													__global const BodyData* rigidBodies, 
+													__global const btCollidableGpu* collidables,
+													__global btAabbCL* aabbs,
+													__global int4* concavePairsOut,
+													__global volatile int* numConcavePairsOut,
+													int numPairs,
+													int maxNumConcavePairsCapacity
+													)
+
+																					btBufferInfoCL( pairs->getBufferCL(), true ), 
+				btBufferInfoCL( bodyBuf->getBufferCL(),true), 
+				btBufferInfoCL( gpuCollidables.getBufferCL(),true), 
+				btBufferInfoCL( convexData.getBufferCL(),true),
+				btBufferInfoCL( gpuVertices.getBufferCL(),true),
+				btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
+				btBufferInfoCL( gpuFaces.getBufferCL(),true),
+				btBufferInfoCL( gpuIndices.getBufferCL(),true),
+				btBufferInfoCL( clAabbsWS.getBufferCL(),true),
+				btBufferInfoCL( sepNormals.getBufferCL()),
+				btBufferInfoCL( hasSeparatingNormals.getBufferCL()),
+				btBufferInfoCL( triangleConvexPairsOut.getBufferCL()),
+				btBufferInfoCL( concaveSepNormals.getBufferCL()),
+				btBufferInfoCL( numConcavePairsOut.getBufferCL())
+
+				*/
+
+				{
+					int np = numConcavePairsOut.at(0);
+					printf("np=%d\n", np);
+					btLauncherCL launcher(m_queue, m_bvhTraversalKernel);
+					launcher.setBuffer( pairs->getBufferCL());
+					launcher.setBuffer(  bodyBuf->getBufferCL());
+					launcher.setBuffer( gpuCollidables.getBufferCL());
+					launcher.setBuffer( clAabbsWS.getBufferCL());
+					launcher.setBuffer( triangleConvexPairsOut.getBufferCL());
+					launcher.setBuffer( numConcavePairsOut.getBufferCL());
+					launcher.setConst( nPairs  );
+					launcher.setConst( maxTriConvexPairCapacity);
+					int num = nPairs;
+					launcher.launch1D( num);
+					clFinish(m_queue);
+					np = numConcavePairsOut.at(0);
+					printf("np=%d\n", np);
+
+				}
+				printf("-----------------------\n", numConcave);
+			}
 			
 
 
@@ -256,7 +395,7 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btI
 					btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
 					btBufferInfoCL( gpuFaces.getBufferCL(),true),
 					btBufferInfoCL( gpuIndices.getBufferCL(),true),
-					btBufferInfoCL( clAabbs.getBufferCL(),true),
+					btBufferInfoCL( clAabbsWS.getBufferCL(),true),
 					btBufferInfoCL( gpuChildShapes.getBufferCL(),true),
 					btBufferInfoCL( gpuCompoundPairs.getBufferCL()),
 					btBufferInfoCL( numCompoundPairsOut.getBufferCL())
@@ -297,7 +436,7 @@ void GpuSatCollision::computeConvexConvexContactsGPUSAT( const btOpenCLArray<btI
 					btBufferInfoCL( gpuUniqueEdges.getBufferCL(),true),
 					btBufferInfoCL( gpuFaces.getBufferCL(),true),
 					btBufferInfoCL( gpuIndices.getBufferCL(),true),
-					btBufferInfoCL( clAabbs.getBufferCL(),true),
+					btBufferInfoCL( clAabbsWS.getBufferCL(),true),
 					btBufferInfoCL( gpuChildShapes.getBufferCL(),true),
 					btBufferInfoCL( gpuCompoundSepNormals.getBufferCL()),
 					btBufferInfoCL( gpuHasCompoundSepNormals.getBufferCL())
diff --git a/opencl/gpu_sat/host/ConvexHullContact.h b/opencl/gpu_sat/host/ConvexHullContact.h
index 68430a957..1d64d0095 100644
--- a/opencl/gpu_sat/host/ConvexHullContact.h
+++ b/opencl/gpu_sat/host/ConvexHullContact.h
@@ -48,6 +48,8 @@ struct GpuSatCollision
 	cl_kernel				m_clipHullHullConcaveConvexKernel;
 	cl_kernel				m_extractManifoldAndAddContactKernel;
     cl_kernel               m_newContactReductionKernel;
+
+	cl_kernel				m_bvhTraversalKernel;
     
 
 	btOpenCLArray<int>		m_totalContactsOut;
@@ -73,6 +75,7 @@ struct GpuSatCollision
            btOpenCLArray<btVector3>& worldNormalsAGPU,
            btOpenCLArray<btVector3>& worldVertsA1GPU,
            btOpenCLArray<btVector3>& worldVertsB2GPU,
+		   btAlignedObjectArray<class btOptimizedBvh*>& bvhData,
 			int numObjects,
 			int maxTriConvexPairCapacity,
 			btOpenCLArray<btInt4>& triangleConvexPairs,
diff --git a/opencl/gpu_sat/host/btOptimizedBvh.cpp b/opencl/gpu_sat/host/btOptimizedBvh.cpp
new file mode 100644
index 000000000..a8dff1691
--- /dev/null
+++ b/opencl/gpu_sat/host/btOptimizedBvh.cpp
@@ -0,0 +1,391 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "btOptimizedBvh.h"
+#include "btStridingMeshInterface.h"
+#include "BulletGeometry/btAabbUtil2.h"
+#include "BulletCommon/btIDebugDraw.h"
+
+
+btOptimizedBvh::btOptimizedBvh()
+{ 
+}
+
+btOptimizedBvh::~btOptimizedBvh()
+{
+}
+
+
+void btOptimizedBvh::build(btStridingMeshInterface* triangles, bool useQuantizedAabbCompression, const btVector3& bvhAabbMin, const btVector3& bvhAabbMax)
+{
+	m_useQuantization = useQuantizedAabbCompression;
+
+
+	// NodeArray	triangleNodes;
+
+	struct	NodeTriangleCallback : public btInternalTriangleIndexCallback
+	{
+
+		NodeArray&	m_triangleNodes;
+
+		NodeTriangleCallback& operator=(NodeTriangleCallback& other)
+		{
+			m_triangleNodes.copyFromArray(other.m_triangleNodes);
+			return *this;
+		}
+		
+		NodeTriangleCallback(NodeArray&	triangleNodes)
+			:m_triangleNodes(triangleNodes)
+		{
+		}
+
+		virtual void internalProcessTriangleIndex(btVector3* triangle,int partId,int  triangleIndex)
+		{
+			btOptimizedBvhNode node;
+			btVector3	aabbMin,aabbMax;
+			aabbMin.setValue(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+			aabbMax.setValue(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT)); 
+			aabbMin.setMin(triangle[0]);
+			aabbMax.setMax(triangle[0]);
+			aabbMin.setMin(triangle[1]);
+			aabbMax.setMax(triangle[1]);
+			aabbMin.setMin(triangle[2]);
+			aabbMax.setMax(triangle[2]);
+
+			//with quantization?
+			node.m_aabbMinOrg = aabbMin;
+			node.m_aabbMaxOrg = aabbMax;
+
+			node.m_escapeIndex = -1;
+	
+			//for child nodes
+			node.m_subPart = partId;
+			node.m_triangleIndex = triangleIndex;
+			m_triangleNodes.push_back(node);
+		}
+	};
+	struct	QuantizedNodeTriangleCallback : public btInternalTriangleIndexCallback
+	{
+		QuantizedNodeArray&	m_triangleNodes;
+		const btQuantizedBvh* m_optimizedTree; // for quantization
+
+		QuantizedNodeTriangleCallback& operator=(QuantizedNodeTriangleCallback& other)
+		{
+			m_triangleNodes.copyFromArray(other.m_triangleNodes);
+			m_optimizedTree = other.m_optimizedTree;
+			return *this;
+		}
+
+		QuantizedNodeTriangleCallback(QuantizedNodeArray&	triangleNodes,const btQuantizedBvh* tree)
+			:m_triangleNodes(triangleNodes),m_optimizedTree(tree)
+		{
+		}
+
+		virtual void internalProcessTriangleIndex(btVector3* triangle,int partId,int  triangleIndex)
+		{
+			// The partId and triangle index must fit in the same (positive) integer
+			btAssert(partId < (1<<MAX_NUM_PARTS_IN_BITS));
+			btAssert(triangleIndex < (1<<(31-MAX_NUM_PARTS_IN_BITS)));
+			//negative indices are reserved for escapeIndex
+			btAssert(triangleIndex>=0);
+
+			btQuantizedBvhNode node;
+			btVector3	aabbMin,aabbMax;
+			aabbMin.setValue(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+			aabbMax.setValue(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT)); 
+			aabbMin.setMin(triangle[0]);
+			aabbMax.setMax(triangle[0]);
+			aabbMin.setMin(triangle[1]);
+			aabbMax.setMax(triangle[1]);
+			aabbMin.setMin(triangle[2]);
+			aabbMax.setMax(triangle[2]);
+
+			//PCK: add these checks for zero dimensions of aabb
+			const btScalar MIN_AABB_DIMENSION = btScalar(0.002);
+			const btScalar MIN_AABB_HALF_DIMENSION = btScalar(0.001);
+			if (aabbMax.x() - aabbMin.x() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setX(aabbMax.x() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setX(aabbMin.x() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.y() - aabbMin.y() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setY(aabbMax.y() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setY(aabbMin.y() - MIN_AABB_HALF_DIMENSION);
+			}
+			if (aabbMax.z() - aabbMin.z() < MIN_AABB_DIMENSION)
+			{
+				aabbMax.setZ(aabbMax.z() + MIN_AABB_HALF_DIMENSION);
+				aabbMin.setZ(aabbMin.z() - MIN_AABB_HALF_DIMENSION);
+			}
+
+			m_optimizedTree->quantize(&node.m_quantizedAabbMin[0],aabbMin,0);
+			m_optimizedTree->quantize(&node.m_quantizedAabbMax[0],aabbMax,1);
+
+			node.m_escapeIndexOrTriangleIndex = (partId<<(31-MAX_NUM_PARTS_IN_BITS)) | triangleIndex;
+
+			m_triangleNodes.push_back(node);
+		}
+	};
+	
+
+
+	int numLeafNodes = 0;
+
+	
+	if (m_useQuantization)
+	{
+
+		//initialize quantization values
+		setQuantizationValues(bvhAabbMin,bvhAabbMax);
+
+		QuantizedNodeTriangleCallback	callback(m_quantizedLeafNodes,this);
+
+	
+		triangles->InternalProcessAllTriangles(&callback,m_bvhAabbMin,m_bvhAabbMax);
+
+		//now we have an array of leafnodes in m_leafNodes
+		numLeafNodes = m_quantizedLeafNodes.size();
+
+
+		m_quantizedContiguousNodes.resize(2*numLeafNodes);
+
+
+	} else
+	{
+		NodeTriangleCallback	callback(m_leafNodes);
+
+		btVector3 aabbMin(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
+		btVector3 aabbMax(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+
+		triangles->InternalProcessAllTriangles(&callback,aabbMin,aabbMax);
+
+		//now we have an array of leafnodes in m_leafNodes
+		numLeafNodes = m_leafNodes.size();
+
+		m_contiguousNodes.resize(2*numLeafNodes);
+	}
+
+	m_curNodeIndex = 0;
+
+	buildTree(0,numLeafNodes);
+
+	///if the entire tree is small then subtree size, we need to create a header info for the tree
+	if(m_useQuantization && !m_SubtreeHeaders.size())
+	{
+		btBvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
+		subtree.m_rootNodeIndex = 0;
+		subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
+	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
+	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
+}
+
+
+
+
+void	btOptimizedBvh::refit(btStridingMeshInterface* meshInterface,const btVector3& aabbMin,const btVector3& aabbMax)
+{
+	if (m_useQuantization)
+	{
+
+		setQuantizationValues(aabbMin,aabbMax);
+
+		updateBvhNodes(meshInterface,0,m_curNodeIndex,0);
+
+		///now update all subtree headers
+
+		int i;
+		for (i=0;i<m_SubtreeHeaders.size();i++)
+		{
+			btBvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
+			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
+		}
+
+	} else
+	{
+
+	}
+}
+
+
+
+
+void	btOptimizedBvh::refitPartial(btStridingMeshInterface* meshInterface,const btVector3& aabbMin,const btVector3& aabbMax)
+{
+	//incrementally initialize quantization values
+	btAssert(m_useQuantization);
+
+	btAssert(aabbMin.getX() > m_bvhAabbMin.getX());
+	btAssert(aabbMin.getY() > m_bvhAabbMin.getY());
+	btAssert(aabbMin.getZ() > m_bvhAabbMin.getZ());
+
+	btAssert(aabbMax.getX() < m_bvhAabbMax.getX());
+	btAssert(aabbMax.getY() < m_bvhAabbMax.getY());
+	btAssert(aabbMax.getZ() < m_bvhAabbMax.getZ());
+
+	///we should update all quantization values, using updateBvhNodes(meshInterface);
+	///but we only update chunks that overlap the given aabb
+	
+	unsigned short	quantizedQueryAabbMin[3];
+	unsigned short	quantizedQueryAabbMax[3];
+
+	quantize(&quantizedQueryAabbMin[0],aabbMin,0);
+	quantize(&quantizedQueryAabbMax[0],aabbMax,1);
+
+	int i;
+	for (i=0;i<this->m_SubtreeHeaders.size();i++)
+	{
+		btBvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
+
+		//PCK: unsigned instead of bool
+		unsigned overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			updateBvhNodes(meshInterface,subtree.m_rootNodeIndex,subtree.m_rootNodeIndex+subtree.m_subtreeSize,i);
+
+			subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[subtree.m_rootNodeIndex]);
+		}
+	}
+	
+}
+
+void	btOptimizedBvh::updateBvhNodes(btStridingMeshInterface* meshInterface,int firstNode,int endNode,int index)
+{
+	(void)index;
+
+	btAssert(m_useQuantization);
+
+	int curNodeSubPart=-1;
+
+	//get access info to trianglemesh data
+		const unsigned char *vertexbase = 0;
+		int numverts = 0;
+		PHY_ScalarType type = PHY_INTEGER;
+		int stride = 0;
+		const unsigned char *indexbase = 0;
+		int indexstride = 0;
+		int numfaces = 0;
+		PHY_ScalarType indicestype = PHY_INTEGER;
+
+		btVector3	triangleVerts[3];
+		btVector3	aabbMin,aabbMax;
+		const btVector3& meshScaling = meshInterface->getScaling();
+		
+		int i;
+		for (i=endNode-1;i>=firstNode;i--)
+		{
+
+
+			btQuantizedBvhNode& curNode = m_quantizedContiguousNodes[i];
+			if (curNode.isLeafNode())
+			{
+				//recalc aabb from triangle data
+				int nodeSubPart = curNode.getPartId();
+				int nodeTriangleIndex = curNode.getTriangleIndex();
+				if (nodeSubPart != curNodeSubPart)
+				{
+					if (curNodeSubPart >= 0)
+						meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
+					meshInterface->getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,	type,stride,&indexbase,indexstride,numfaces,indicestype,nodeSubPart);
+
+					curNodeSubPart = nodeSubPart;
+					btAssert(indicestype==PHY_INTEGER||indicestype==PHY_SHORT);
+				}
+				//triangles->getLockedReadOnlyVertexIndexBase(vertexBase,numVerts,
+
+				unsigned int* gfxbase = (unsigned int*)(indexbase+nodeTriangleIndex*indexstride);
+				
+				
+				for (int j=2;j>=0;j--)
+				{
+					
+					int graphicsindex = indicestype==PHY_SHORT?((unsigned short*)gfxbase)[j]:gfxbase[j];
+					if (type == PHY_FLOAT)
+					{
+						float* graphicsbase = (float*)(vertexbase+graphicsindex*stride);
+						triangleVerts[j] = btVector3(
+							graphicsbase[0]*meshScaling.getX(),
+							graphicsbase[1]*meshScaling.getY(),
+							graphicsbase[2]*meshScaling.getZ());
+					}
+					else
+					{
+						double* graphicsbase = (double*)(vertexbase+graphicsindex*stride);
+						triangleVerts[j] = btVector3( btScalar(graphicsbase[0]*meshScaling.getX()), btScalar(graphicsbase[1]*meshScaling.getY()), btScalar(graphicsbase[2]*meshScaling.getZ()));
+					}
+				}
+
+
+				
+				aabbMin.setValue(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+				aabbMax.setValue(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT)); 
+				aabbMin.setMin(triangleVerts[0]);
+				aabbMax.setMax(triangleVerts[0]);
+				aabbMin.setMin(triangleVerts[1]);
+				aabbMax.setMax(triangleVerts[1]);
+				aabbMin.setMin(triangleVerts[2]);
+				aabbMax.setMax(triangleVerts[2]);
+
+				quantize(&curNode.m_quantizedAabbMin[0],aabbMin,0);
+				quantize(&curNode.m_quantizedAabbMax[0],aabbMax,1);
+				
+			} else
+			{
+				//combine aabb from both children
+
+				btQuantizedBvhNode* leftChildNode = &m_quantizedContiguousNodes[i+1];
+				
+				btQuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? &m_quantizedContiguousNodes[i+2] :
+					&m_quantizedContiguousNodes[i+1+leftChildNode->getEscapeIndex()];
+				
+
+				{
+					for (int i=0;i<3;i++)
+					{
+						curNode.m_quantizedAabbMin[i] = leftChildNode->m_quantizedAabbMin[i];
+						if (curNode.m_quantizedAabbMin[i]>rightChildNode->m_quantizedAabbMin[i])
+							curNode.m_quantizedAabbMin[i]=rightChildNode->m_quantizedAabbMin[i];
+
+						curNode.m_quantizedAabbMax[i] = leftChildNode->m_quantizedAabbMax[i];
+						if (curNode.m_quantizedAabbMax[i] < rightChildNode->m_quantizedAabbMax[i])
+							curNode.m_quantizedAabbMax[i] = rightChildNode->m_quantizedAabbMax[i];
+					}
+				}
+			}
+
+		}
+
+		if (curNodeSubPart >= 0)
+			meshInterface->unLockReadOnlyVertexBase(curNodeSubPart);
+
+		
+}
+
+///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+btOptimizedBvh* btOptimizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
+{
+	btQuantizedBvh* bvh = btQuantizedBvh::deSerializeInPlace(i_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
+	
+	//we don't add additional data so just do a static upcast
+	return static_cast<btOptimizedBvh*>(bvh);
+}
diff --git a/opencl/gpu_sat/host/btOptimizedBvh.h b/opencl/gpu_sat/host/btOptimizedBvh.h
new file mode 100644
index 000000000..e6692b456
--- /dev/null
+++ b/opencl/gpu_sat/host/btOptimizedBvh.h
@@ -0,0 +1,65 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///Contains contributions from Disney Studio's
+
+#ifndef BT_OPTIMIZED_BVH_H
+#define BT_OPTIMIZED_BVH_H
+
+#include "btQuantizedBvh.h"
+
+class btStridingMeshInterface;
+
+
+///The btOptimizedBvh extends the btQuantizedBvh to create AABB tree for triangle meshes, through the btStridingMeshInterface.
+ATTRIBUTE_ALIGNED16(class) btOptimizedBvh : public btQuantizedBvh
+{
+	
+public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+protected:
+
+public:
+
+	btOptimizedBvh();
+
+	virtual ~btOptimizedBvh();
+
+	void	build(btStridingMeshInterface* triangles,bool useQuantizedAabbCompression, const btVector3& bvhAabbMin, const btVector3& bvhAabbMax);
+
+	void	refit(btStridingMeshInterface* triangles,const btVector3& aabbMin,const btVector3& aabbMax);
+
+	void	refitPartial(btStridingMeshInterface* triangles,const btVector3& aabbMin, const btVector3& aabbMax);
+
+	void	updateBvhNodes(btStridingMeshInterface* meshInterface,int firstNode,int endNode,int index);
+
+	/// Data buffer MUST be 16 byte aligned
+	virtual bool serializeInPlace(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const
+	{
+		return btQuantizedBvh::serialize(o_alignedDataBuffer,i_dataBufferSize,i_swapEndian);
+
+	}
+
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static btOptimizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
+
+
+};
+
+
+#endif //BT_OPTIMIZED_BVH_H
+
+
diff --git a/opencl/gpu_sat/host/btQuantizedBvh.cpp b/opencl/gpu_sat/host/btQuantizedBvh.cpp
new file mode 100644
index 000000000..3c10d5bcf
--- /dev/null
+++ b/opencl/gpu_sat/host/btQuantizedBvh.cpp
@@ -0,0 +1,1302 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btQuantizedBvh.h"
+
+#include "BulletGeometry/btAabbUtil2.h"
+#include "BulletCommon/btIDebugDraw.h"
+
+
+#define RAYAABB2
+
+btQuantizedBvh::btQuantizedBvh() : 
+					m_bulletVersion(BT_BULLET_VERSION),
+					m_useQuantization(false), 
+					m_traversalMode(TRAVERSAL_STACKLESS_CACHE_FRIENDLY)
+					//m_traversalMode(TRAVERSAL_STACKLESS)
+					//m_traversalMode(TRAVERSAL_RECURSIVE)
+					,m_subtreeHeaderCount(0) //PCK: add this line
+{
+	m_bvhAabbMin.setValue(-SIMD_INFINITY,-SIMD_INFINITY,-SIMD_INFINITY);
+	m_bvhAabbMax.setValue(SIMD_INFINITY,SIMD_INFINITY,SIMD_INFINITY);
+}
+
+
+
+
+
+void btQuantizedBvh::buildInternal()
+{
+	///assumes that caller filled in the m_quantizedLeafNodes
+	m_useQuantization = true;
+	int numLeafNodes = 0;
+	
+	if (m_useQuantization)
+	{
+		//now we have an array of leafnodes in m_leafNodes
+		numLeafNodes = m_quantizedLeafNodes.size();
+
+		m_quantizedContiguousNodes.resize(2*numLeafNodes);
+
+	}
+
+	m_curNodeIndex = 0;
+
+	buildTree(0,numLeafNodes);
+
+	///if the entire tree is small then subtree size, we need to create a header info for the tree
+	if(m_useQuantization && !m_SubtreeHeaders.size())
+	{
+		btBvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(m_quantizedContiguousNodes[0]);
+		subtree.m_rootNodeIndex = 0;
+		subtree.m_subtreeSize = m_quantizedContiguousNodes[0].isLeafNode() ? 1 : m_quantizedContiguousNodes[0].getEscapeIndex();
+	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+	//PCK: clear m_quantizedLeafNodes and m_leafNodes, they are temporary
+	m_quantizedLeafNodes.clear();
+	m_leafNodes.clear();
+}
+
+
+
+///just for debugging, to visualize the individual patches/subtrees
+#ifdef DEBUG_PATCH_COLORS
+btVector3 color[4]=
+{
+	btVector3(1,0,0),
+	btVector3(0,1,0),
+	btVector3(0,0,1),
+	btVector3(0,1,1)
+};
+#endif //DEBUG_PATCH_COLORS
+
+
+
+void	btQuantizedBvh::setQuantizationValues(const btVector3& bvhAabbMin,const btVector3& bvhAabbMax,btScalar quantizationMargin)
+{
+	//enlarge the AABB to avoid division by zero when initializing the quantization values
+	btVector3 clampValue(quantizationMargin,quantizationMargin,quantizationMargin);
+	m_bvhAabbMin = bvhAabbMin - clampValue;
+	m_bvhAabbMax = bvhAabbMax + clampValue;
+	btVector3 aabbSize = m_bvhAabbMax - m_bvhAabbMin;
+	m_bvhQuantization = btVector3(btScalar(65533.0),btScalar(65533.0),btScalar(65533.0)) / aabbSize;
+	m_useQuantization = true;
+}
+
+
+
+
+btQuantizedBvh::~btQuantizedBvh()
+{
+}
+
+#ifdef DEBUG_TREE_BUILDING
+int gStackDepth = 0;
+int gMaxStackDepth = 0;
+#endif //DEBUG_TREE_BUILDING
+
+void	btQuantizedBvh::buildTree	(int startIndex,int endIndex)
+{
+#ifdef DEBUG_TREE_BUILDING
+	gStackDepth++;
+	if (gStackDepth > gMaxStackDepth)
+		gMaxStackDepth = gStackDepth;
+#endif //DEBUG_TREE_BUILDING
+
+
+	int splitAxis, splitIndex, i;
+	int numIndices =endIndex-startIndex;
+	int curIndex = m_curNodeIndex;
+
+	btAssert(numIndices>0);
+
+	if (numIndices==1)
+	{
+#ifdef DEBUG_TREE_BUILDING
+		gStackDepth--;
+#endif //DEBUG_TREE_BUILDING
+		
+		assignInternalNodeFromLeafNode(m_curNodeIndex,startIndex);
+
+		m_curNodeIndex++;
+		return;	
+	}
+	//calculate Best Splitting Axis and where to split it. Sort the incoming 'leafNodes' array within range 'startIndex/endIndex'.
+	
+	splitAxis = calcSplittingAxis(startIndex,endIndex);
+
+	splitIndex = sortAndCalcSplittingIndex(startIndex,endIndex,splitAxis);
+
+	int internalNodeIndex = m_curNodeIndex;
+	
+	//set the min aabb to 'inf' or a max value, and set the max aabb to a -inf/minimum value.
+	//the aabb will be expanded during buildTree/mergeInternalNodeAabb with actual node values
+	setInternalNodeAabbMin(m_curNodeIndex,m_bvhAabbMax);//can't use btVector3(SIMD_INFINITY,SIMD_INFINITY,SIMD_INFINITY)) because of quantization
+	setInternalNodeAabbMax(m_curNodeIndex,m_bvhAabbMin);//can't use btVector3(-SIMD_INFINITY,-SIMD_INFINITY,-SIMD_INFINITY)) because of quantization
+	
+	
+	for (i=startIndex;i<endIndex;i++)
+	{
+		mergeInternalNodeAabb(m_curNodeIndex,getAabbMin(i),getAabbMax(i));
+	}
+
+	m_curNodeIndex++;
+	
+
+	//internalNode->m_escapeIndex;
+	
+	int leftChildNodexIndex = m_curNodeIndex;
+
+	//build left child tree
+	buildTree(startIndex,splitIndex);
+
+	int rightChildNodexIndex = m_curNodeIndex;
+	//build right child tree
+	buildTree(splitIndex,endIndex);
+
+#ifdef DEBUG_TREE_BUILDING
+	gStackDepth--;
+#endif //DEBUG_TREE_BUILDING
+
+	int escapeIndex = m_curNodeIndex - curIndex;
+
+	if (m_useQuantization)
+	{
+		//escapeIndex is the number of nodes of this subtree
+		const int sizeQuantizedNode =sizeof(btQuantizedBvhNode);
+		const int treeSizeInBytes = escapeIndex * sizeQuantizedNode;
+		if (treeSizeInBytes > MAX_SUBTREE_SIZE_IN_BYTES)
+		{
+			updateSubtreeHeaders(leftChildNodexIndex,rightChildNodexIndex);
+		}
+	} else
+	{
+
+	}
+
+	setInternalNodeEscapeIndex(internalNodeIndex,escapeIndex);
+
+}
+
+void	btQuantizedBvh::updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex)
+{
+	btAssert(m_useQuantization);
+
+	btQuantizedBvhNode& leftChildNode = m_quantizedContiguousNodes[leftChildNodexIndex];
+	int leftSubTreeSize = leftChildNode.isLeafNode() ? 1 : leftChildNode.getEscapeIndex();
+	int leftSubTreeSizeInBytes =  leftSubTreeSize * static_cast<int>(sizeof(btQuantizedBvhNode));
+	
+	btQuantizedBvhNode& rightChildNode = m_quantizedContiguousNodes[rightChildNodexIndex];
+	int rightSubTreeSize = rightChildNode.isLeafNode() ? 1 : rightChildNode.getEscapeIndex();
+	int rightSubTreeSizeInBytes =  rightSubTreeSize *  static_cast<int>(sizeof(btQuantizedBvhNode));
+
+	if(leftSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES)
+	{
+		btBvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(leftChildNode);
+		subtree.m_rootNodeIndex = leftChildNodexIndex;
+		subtree.m_subtreeSize = leftSubTreeSize;
+	}
+
+	if(rightSubTreeSizeInBytes <= MAX_SUBTREE_SIZE_IN_BYTES)
+	{
+		btBvhSubtreeInfo& subtree = m_SubtreeHeaders.expand();
+		subtree.setAabbFromQuantizeNode(rightChildNode);
+		subtree.m_rootNodeIndex = rightChildNodexIndex;
+		subtree.m_subtreeSize = rightSubTreeSize;
+	}
+
+	//PCK: update the copy of the size
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+}
+
+
+int	btQuantizedBvh::sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis)
+{
+	int i;
+	int splitIndex =startIndex;
+	int numIndices = endIndex - startIndex;
+	btScalar splitValue;
+
+	btVector3 means(btScalar(0.),btScalar(0.),btScalar(0.));
+	for (i=startIndex;i<endIndex;i++)
+	{
+		btVector3 center = btScalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		means+=center;
+	}
+	means *= (btScalar(1.)/(btScalar)numIndices);
+	
+	splitValue = means[splitAxis];
+	
+	//sort leafNodes so all values larger then splitValue comes first, and smaller values start from 'splitIndex'.
+	for (i=startIndex;i<endIndex;i++)
+	{
+		btVector3 center = btScalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		if (center[splitAxis] > splitValue)
+		{
+			//swap
+			swapLeafNodes(i,splitIndex);
+			splitIndex++;
+		}
+	}
+
+	//if the splitIndex causes unbalanced trees, fix this by using the center in between startIndex and endIndex
+	//otherwise the tree-building might fail due to stack-overflows in certain cases.
+	//unbalanced1 is unsafe: it can cause stack overflows
+	//bool unbalanced1 = ((splitIndex==startIndex) || (splitIndex == (endIndex-1)));
+
+	//unbalanced2 should work too: always use center (perfect balanced trees)	
+	//bool unbalanced2 = true;
+
+	//this should be safe too:
+	int rangeBalancedIndices = numIndices/3;
+	bool unbalanced = ((splitIndex<=(startIndex+rangeBalancedIndices)) || (splitIndex >=(endIndex-1-rangeBalancedIndices)));
+	
+	if (unbalanced)
+	{
+		splitIndex = startIndex+ (numIndices>>1);
+	}
+
+	bool unbal = (splitIndex==startIndex) || (splitIndex == (endIndex));
+	(void)unbal;
+	btAssert(!unbal);
+
+	return splitIndex;
+}
+
+
+int	btQuantizedBvh::calcSplittingAxis(int startIndex,int endIndex)
+{
+	int i;
+
+	btVector3 means(btScalar(0.),btScalar(0.),btScalar(0.));
+	btVector3 variance(btScalar(0.),btScalar(0.),btScalar(0.));
+	int numIndices = endIndex-startIndex;
+
+	for (i=startIndex;i<endIndex;i++)
+	{
+		btVector3 center = btScalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		means+=center;
+	}
+	means *= (btScalar(1.)/(btScalar)numIndices);
+		
+	for (i=startIndex;i<endIndex;i++)
+	{
+		btVector3 center = btScalar(0.5)*(getAabbMax(i)+getAabbMin(i));
+		btVector3 diff2 = center-means;
+		diff2 = diff2 * diff2;
+		variance += diff2;
+	}
+	variance *= (btScalar(1.)/	((btScalar)numIndices-1)	);
+	
+	return variance.maxAxis();
+}
+
+
+
+void	btQuantizedBvh::reportAabbOverlappingNodex(btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const
+{
+	//either choose recursive traversal (walkTree) or stackless (walkStacklessTree)
+
+	if (m_useQuantization)
+	{
+		///quantize query AABB
+		unsigned short int quantizedQueryAabbMin[3];
+		unsigned short int quantizedQueryAabbMax[3];
+		quantizeWithClamp(quantizedQueryAabbMin,aabbMin,0);
+		quantizeWithClamp(quantizedQueryAabbMax,aabbMax,1);
+
+		switch (m_traversalMode)
+		{
+		case TRAVERSAL_STACKLESS:
+				walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,0,m_curNodeIndex);
+			break;
+		case TRAVERSAL_STACKLESS_CACHE_FRIENDLY:
+				walkStacklessQuantizedTreeCacheFriendly(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			break;
+		case TRAVERSAL_RECURSIVE:
+			{
+				const btQuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[0];
+				walkRecursiveQuantizedTreeAgainstQueryAabb(rootNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+			}
+			break;
+		default:
+			//unsupported
+			btAssert(0);
+		}
+	} else
+	{
+		walkStacklessTree(nodeCallback,aabbMin,aabbMax);
+	}
+}
+
+
+int maxIterations = 0;
+
+
+void	btQuantizedBvh::walkStacklessTree(btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const
+{
+	btAssert(!m_useQuantization);
+
+	const btOptimizedBvhNode* rootNode = &m_contiguousNodes[0];
+	int escapeIndex, curIndex = 0;
+	int walkIterations = 0;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+
+	while (curIndex < m_curNodeIndex)
+	{
+		//catch bugs in tree data
+		btAssert (walkIterations < m_curNodeIndex);
+
+		walkIterations++;
+		aabbOverlap = TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
+		isLeafNode = rootNode->m_escapeIndex == -1;
+		
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (aabbOverlap != 0))
+		{
+			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
+		} 
+		
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->m_escapeIndex;
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (maxIterations < walkIterations)
+		maxIterations = walkIterations;
+
+}
+
+/*
+///this was the original recursive traversal, before we optimized towards stackless traversal
+void	btQuantizedBvh::walkTree(btOptimizedBvhNode* rootNode,btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const
+{
+	bool isLeafNode, aabbOverlap = TestAabbAgainstAabb2(aabbMin,aabbMax,rootNode->m_aabbMin,rootNode->m_aabbMax);
+	if (aabbOverlap)
+	{
+		isLeafNode = (!rootNode->m_leftChild && !rootNode->m_rightChild);
+		if (isLeafNode)
+		{
+			nodeCallback->processNode(rootNode);
+		} else
+		{
+			walkTree(rootNode->m_leftChild,nodeCallback,aabbMin,aabbMax);
+			walkTree(rootNode->m_rightChild,nodeCallback,aabbMin,aabbMax);
+		}
+	}
+
+}
+*/
+
+void btQuantizedBvh::walkRecursiveQuantizedTreeAgainstQueryAabb(const btQuantizedBvhNode* currentNode,btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const
+{
+	btAssert(m_useQuantization);
+	
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+
+	//PCK: unsigned instead of bool
+	aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,currentNode->m_quantizedAabbMin,currentNode->m_quantizedAabbMax);
+	isLeafNode = currentNode->isLeafNode();
+		
+	//PCK: unsigned instead of bool
+	if (aabbOverlap != 0)
+	{
+		if (isLeafNode)
+		{
+			nodeCallback->processNode(currentNode->getPartId(),currentNode->getTriangleIndex());
+		} else
+		{
+			//process left and right children
+			const btQuantizedBvhNode* leftChildNode = currentNode+1;
+			walkRecursiveQuantizedTreeAgainstQueryAabb(leftChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+
+			const btQuantizedBvhNode* rightChildNode = leftChildNode->isLeafNode() ? leftChildNode+1:leftChildNode+leftChildNode->getEscapeIndex();
+			walkRecursiveQuantizedTreeAgainstQueryAabb(rightChildNode,nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax);
+		}
+	}		
+}
+
+
+
+void	btQuantizedBvh::walkStacklessTreeAgainstRay(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin, const btVector3& aabbMax, int startNodeIndex,int endNodeIndex) const
+{
+	btAssert(!m_useQuantization);
+
+	const btOptimizedBvhNode* rootNode = &m_contiguousNodes[0];
+	int escapeIndex, curIndex = 0;
+	int walkIterations = 0;
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap=0;
+	unsigned rayBoxOverlap=0;
+	btScalar lambda_max = 1.0;
+	
+		/* Quick pruning by quantized box */
+	btVector3 rayAabbMin = raySource;
+	btVector3 rayAabbMax = raySource;
+	rayAabbMin.setMin(rayTarget);
+	rayAabbMax.setMax(rayTarget);
+
+	/* Add box cast extents to bounding box */
+	rayAabbMin += aabbMin;
+	rayAabbMax += aabbMax;
+
+#ifdef RAYAABB2
+	btVector3 rayDir = (rayTarget-raySource);
+	rayDir.normalize ();
+	lambda_max = rayDir.dot(rayTarget-raySource);
+	///what about division by zero? --> just set rayDirection[i] to 1.0
+	btVector3 rayDirectionInverse;
+	rayDirectionInverse[0] = rayDir[0] == btScalar(0.0) ? btScalar(BT_LARGE_FLOAT) : btScalar(1.0) / rayDir[0];
+	rayDirectionInverse[1] = rayDir[1] == btScalar(0.0) ? btScalar(BT_LARGE_FLOAT) : btScalar(1.0) / rayDir[1];
+	rayDirectionInverse[2] = rayDir[2] == btScalar(0.0) ? btScalar(BT_LARGE_FLOAT) : btScalar(1.0) / rayDir[2];
+	unsigned int sign[3] = { rayDirectionInverse[0] < 0.0, rayDirectionInverse[1] < 0.0, rayDirectionInverse[2] < 0.0};
+#endif
+
+	btVector3 bounds[2];
+
+	while (curIndex < m_curNodeIndex)
+	{
+		btScalar param = 1.0;
+		//catch bugs in tree data
+		btAssert (walkIterations < m_curNodeIndex);
+
+		walkIterations++;
+
+		bounds[0] = rootNode->m_aabbMinOrg;
+		bounds[1] = rootNode->m_aabbMaxOrg;
+		/* Add box cast extents */
+		bounds[0] -= aabbMax;
+		bounds[1] -= aabbMin;
+
+		aabbOverlap = TestAabbAgainstAabb2(rayAabbMin,rayAabbMax,rootNode->m_aabbMinOrg,rootNode->m_aabbMaxOrg);
+		//perhaps profile if it is worth doing the aabbOverlap test first
+
+#ifdef RAYAABB2
+			///careful with this check: need to check division by zero (above) and fix the unQuantize method
+			///thanks Joerg/hiker for the reproduction case!
+			///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858
+		rayBoxOverlap = aabbOverlap ? btRayAabb2 (raySource, rayDirectionInverse, sign, bounds, param, 0.0f, lambda_max) : false;
+
+#else
+		btVector3 normal;
+		rayBoxOverlap = btRayAabb(raySource, rayTarget,bounds[0],bounds[1],param, normal);
+#endif
+
+		isLeafNode = rootNode->m_escapeIndex == -1;
+		
+		//PCK: unsigned instead of bool
+		if (isLeafNode && (rayBoxOverlap != 0))
+		{
+			nodeCallback->processNode(rootNode->m_subPart,rootNode->m_triangleIndex);
+		} 
+		
+		//PCK: unsigned instead of bool
+		if ((rayBoxOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->m_escapeIndex;
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (maxIterations < walkIterations)
+		maxIterations = walkIterations;
+
+}
+
+
+
+void	btQuantizedBvh::walkStacklessQuantizedTreeAgainstRay(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin, const btVector3& aabbMax, int startNodeIndex,int endNodeIndex) const
+{
+	btAssert(m_useQuantization);
+	
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex;
+	(void)subTreeSize;
+
+	const btQuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex];
+	int escapeIndex;
+	
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned boxBoxOverlap = 0;
+	unsigned rayBoxOverlap = 0;
+
+	btScalar lambda_max = 1.0;
+
+#ifdef RAYAABB2
+	btVector3 rayDirection = (rayTarget-raySource);
+	rayDirection.normalize ();
+	lambda_max = rayDirection.dot(rayTarget-raySource);
+	///what about division by zero? --> just set rayDirection[i] to 1.0
+	rayDirection[0] = rayDirection[0] == btScalar(0.0) ? btScalar(BT_LARGE_FLOAT) : btScalar(1.0) / rayDirection[0];
+	rayDirection[1] = rayDirection[1] == btScalar(0.0) ? btScalar(BT_LARGE_FLOAT) : btScalar(1.0) / rayDirection[1];
+	rayDirection[2] = rayDirection[2] == btScalar(0.0) ? btScalar(BT_LARGE_FLOAT) : btScalar(1.0) / rayDirection[2];
+	unsigned int sign[3] = { rayDirection[0] < 0.0, rayDirection[1] < 0.0, rayDirection[2] < 0.0};
+#endif
+
+	/* Quick pruning by quantized box */
+	btVector3 rayAabbMin = raySource;
+	btVector3 rayAabbMax = raySource;
+	rayAabbMin.setMin(rayTarget);
+	rayAabbMax.setMax(rayTarget);
+
+	/* Add box cast extents to bounding box */
+	rayAabbMin += aabbMin;
+	rayAabbMax += aabbMax;
+
+	unsigned short int quantizedQueryAabbMin[3];
+	unsigned short int quantizedQueryAabbMax[3];
+	quantizeWithClamp(quantizedQueryAabbMin,rayAabbMin,0);
+	quantizeWithClamp(quantizedQueryAabbMax,rayAabbMax,1);
+
+	while (curIndex < endNodeIndex)
+	{
+
+//#define VISUALLY_ANALYZE_BVH 1
+#ifdef VISUALLY_ANALYZE_BVH
+		//some code snippet to debugDraw aabb, to visually analyze bvh structure
+		static int drawPatch = 0;
+		//need some global access to a debugDrawer
+		extern btIDebugDraw* debugDrawerPtr;
+		if (curIndex==drawPatch)
+		{
+			btVector3 aabbMin,aabbMax;
+			aabbMin = unQuantize(rootNode->m_quantizedAabbMin);
+			aabbMax = unQuantize(rootNode->m_quantizedAabbMax);
+			btVector3	color(1,0,0);
+			debugDrawerPtr->drawAabb(aabbMin,aabbMax,color);
+		}
+#endif//VISUALLY_ANALYZE_BVH
+
+		//catch bugs in tree data
+		btAssert (walkIterations < subTreeSize);
+
+		walkIterations++;
+		//PCK: unsigned instead of bool
+		// only interested if this is closer than any previous hit
+		btScalar param = 1.0;
+		rayBoxOverlap = 0;
+		boxBoxOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+		if (boxBoxOverlap)
+		{
+			btVector3 bounds[2];
+			bounds[0] = unQuantize(rootNode->m_quantizedAabbMin);
+			bounds[1] = unQuantize(rootNode->m_quantizedAabbMax);
+			/* Add box cast extents */
+			bounds[0] -= aabbMax;
+			bounds[1] -= aabbMin;
+			btVector3 normal;
+#if 0
+			bool ra2 = btRayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0, lambda_max);
+			bool ra = btRayAabb (raySource, rayTarget, bounds[0], bounds[1], param, normal);
+			if (ra2 != ra)
+			{
+				printf("functions don't match\n");
+			}
+#endif
+#ifdef RAYAABB2
+			///careful with this check: need to check division by zero (above) and fix the unQuantize method
+			///thanks Joerg/hiker for the reproduction case!
+			///http://www.bulletphysics.com/Bullet/phpBB3/viewtopic.php?f=9&t=1858
+
+			//BT_PROFILE("btRayAabb2");
+			rayBoxOverlap = btRayAabb2 (raySource, rayDirection, sign, bounds, param, 0.0f, lambda_max);
+			
+#else
+			rayBoxOverlap = true;//btRayAabb(raySource, rayTarget, bounds[0], bounds[1], param, normal);
+#endif
+		}
+		
+		if (isLeafNode && rayBoxOverlap)
+		{
+			nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex());
+		}
+		
+		//PCK: unsigned instead of bool
+		if ((rayBoxOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex();
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (maxIterations < walkIterations)
+		maxIterations = walkIterations;
+
+}
+
+void	btQuantizedBvh::walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const
+{
+	btAssert(m_useQuantization);
+	
+	int curIndex = startNodeIndex;
+	int walkIterations = 0;
+	int subTreeSize = endNodeIndex - startNodeIndex;
+	(void)subTreeSize;
+
+	const btQuantizedBvhNode* rootNode = &m_quantizedContiguousNodes[startNodeIndex];
+	int escapeIndex;
+	
+	bool isLeafNode;
+	//PCK: unsigned instead of bool
+	unsigned aabbOverlap;
+
+	while (curIndex < endNodeIndex)
+	{
+
+//#define VISUALLY_ANALYZE_BVH 1
+#ifdef VISUALLY_ANALYZE_BVH
+		//some code snippet to debugDraw aabb, to visually analyze bvh structure
+		static int drawPatch = 0;
+		//need some global access to a debugDrawer
+		extern btIDebugDraw* debugDrawerPtr;
+		if (curIndex==drawPatch)
+		{
+			btVector3 aabbMin,aabbMax;
+			aabbMin = unQuantize(rootNode->m_quantizedAabbMin);
+			aabbMax = unQuantize(rootNode->m_quantizedAabbMax);
+			btVector3	color(1,0,0);
+			debugDrawerPtr->drawAabb(aabbMin,aabbMax,color);
+		}
+#endif//VISUALLY_ANALYZE_BVH
+
+		//catch bugs in tree data
+		btAssert (walkIterations < subTreeSize);
+
+		walkIterations++;
+		//PCK: unsigned instead of bool
+		aabbOverlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,rootNode->m_quantizedAabbMin,rootNode->m_quantizedAabbMax);
+		isLeafNode = rootNode->isLeafNode();
+		
+		if (isLeafNode && aabbOverlap)
+		{
+			nodeCallback->processNode(rootNode->getPartId(),rootNode->getTriangleIndex());
+		} 
+		
+		//PCK: unsigned instead of bool
+		if ((aabbOverlap != 0) || isLeafNode)
+		{
+			rootNode++;
+			curIndex++;
+		} else
+		{
+			escapeIndex = rootNode->getEscapeIndex();
+			rootNode += escapeIndex;
+			curIndex += escapeIndex;
+		}
+	}
+	if (maxIterations < walkIterations)
+		maxIterations = walkIterations;
+
+}
+
+//This traversal can be called from Playstation 3 SPU
+void	btQuantizedBvh::walkStacklessQuantizedTreeCacheFriendly(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const
+{
+	btAssert(m_useQuantization);
+
+	int i;
+
+
+	for (i=0;i<this->m_SubtreeHeaders.size();i++)
+	{
+		const btBvhSubtreeInfo& subtree = m_SubtreeHeaders[i];
+
+		//PCK: unsigned instead of bool
+		unsigned overlap = testQuantizedAabbAgainstQuantizedAabb(quantizedQueryAabbMin,quantizedQueryAabbMax,subtree.m_quantizedAabbMin,subtree.m_quantizedAabbMax);
+		if (overlap != 0)
+		{
+			walkStacklessQuantizedTree(nodeCallback,quantizedQueryAabbMin,quantizedQueryAabbMax,
+				subtree.m_rootNodeIndex,
+				subtree.m_rootNodeIndex+subtree.m_subtreeSize);
+		}
+	}
+}
+
+
+void	btQuantizedBvh::reportRayOverlappingNodex (btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget) const
+{
+	reportBoxCastOverlappingNodex(nodeCallback,raySource,rayTarget,btVector3(0,0,0),btVector3(0,0,0));
+}
+
+
+void	btQuantizedBvh::reportBoxCastOverlappingNodex(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin,const btVector3& aabbMax) const
+{
+	//always use stackless
+
+	if (m_useQuantization)
+	{
+		walkStacklessQuantizedTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex);
+	}
+	else
+	{
+		walkStacklessTreeAgainstRay(nodeCallback, raySource, rayTarget, aabbMin, aabbMax, 0, m_curNodeIndex);
+	}
+	/*
+	{
+		//recursive traversal
+		btVector3 qaabbMin = raySource;
+		btVector3 qaabbMax = raySource;
+		qaabbMin.setMin(rayTarget);
+		qaabbMax.setMax(rayTarget);
+		qaabbMin += aabbMin;
+		qaabbMax += aabbMax;
+		reportAabbOverlappingNodex(nodeCallback,qaabbMin,qaabbMax);
+	}
+	*/
+
+}
+
+
+void	btQuantizedBvh::swapLeafNodes(int i,int splitIndex)
+{
+	if (m_useQuantization)
+	{
+			btQuantizedBvhNode tmp = m_quantizedLeafNodes[i];
+			m_quantizedLeafNodes[i] = m_quantizedLeafNodes[splitIndex];
+			m_quantizedLeafNodes[splitIndex] = tmp;
+	} else
+	{
+			btOptimizedBvhNode tmp = m_leafNodes[i];
+			m_leafNodes[i] = m_leafNodes[splitIndex];
+			m_leafNodes[splitIndex] = tmp;
+	}
+}
+
+void	btQuantizedBvh::assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex)
+{
+	if (m_useQuantization)
+	{
+		m_quantizedContiguousNodes[internalNode] = m_quantizedLeafNodes[leafNodeIndex];
+	} else
+	{
+		m_contiguousNodes[internalNode] = m_leafNodes[leafNodeIndex];
+	}
+}
+
+//PCK: include
+#include <new>
+
+#if 0
+//PCK: consts
+static const unsigned BVH_ALIGNMENT = 16;
+static const unsigned BVH_ALIGNMENT_MASK = BVH_ALIGNMENT-1;
+
+static const unsigned BVH_ALIGNMENT_BLOCKS = 2;
+#endif
+
+
+unsigned int btQuantizedBvh::getAlignmentSerializationPadding()
+{
+	// I changed this to 0 since the extra padding is not needed or used.
+	return 0;//BVH_ALIGNMENT_BLOCKS * BVH_ALIGNMENT;
+}
+
+unsigned btQuantizedBvh::calculateSerializeBufferSize() const
+{
+	unsigned baseSize = sizeof(btQuantizedBvh) + getAlignmentSerializationPadding();
+	baseSize += sizeof(btBvhSubtreeInfo) * m_subtreeHeaderCount;
+	if (m_useQuantization)
+	{
+		return baseSize + m_curNodeIndex * sizeof(btQuantizedBvhNode);
+	}
+	return baseSize + m_curNodeIndex * sizeof(btOptimizedBvhNode);
+}
+
+bool btQuantizedBvh::serialize(void *o_alignedDataBuffer, unsigned /*i_dataBufferSize */, bool i_swapEndian) const
+{
+	btAssert(m_subtreeHeaderCount == m_SubtreeHeaders.size());
+	m_subtreeHeaderCount = m_SubtreeHeaders.size();
+
+/*	if (i_dataBufferSize < calculateSerializeBufferSize() || o_alignedDataBuffer == NULL || (((unsigned)o_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		///check alignedment for buffer?
+		btAssert(0);
+		return false;
+	}
+*/
+
+	btQuantizedBvh *targetBvh = (btQuantizedBvh *)o_alignedDataBuffer;
+
+	// construct the class so the virtual function table, etc will be set up
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (targetBvh) btQuantizedBvh;
+
+	if (i_swapEndian)
+	{
+		targetBvh->m_curNodeIndex = static_cast<int>(btSwapEndian(m_curNodeIndex));
+
+
+		btSwapVector3Endian(m_bvhAabbMin,targetBvh->m_bvhAabbMin);
+		btSwapVector3Endian(m_bvhAabbMax,targetBvh->m_bvhAabbMax);
+		btSwapVector3Endian(m_bvhQuantization,targetBvh->m_bvhQuantization);
+
+		targetBvh->m_traversalMode = (btTraversalMode)btSwapEndian(m_traversalMode);
+		targetBvh->m_subtreeHeaderCount = static_cast<int>(btSwapEndian(m_subtreeHeaderCount));
+	}
+	else
+	{
+		targetBvh->m_curNodeIndex = m_curNodeIndex;
+		targetBvh->m_bvhAabbMin = m_bvhAabbMin;
+		targetBvh->m_bvhAabbMax = m_bvhAabbMax;
+		targetBvh->m_bvhQuantization = m_bvhQuantization;
+		targetBvh->m_traversalMode = m_traversalMode;
+		targetBvh->m_subtreeHeaderCount = m_subtreeHeaderCount;
+	}
+
+	targetBvh->m_useQuantization = m_useQuantization;
+
+	unsigned char *nodeData = (unsigned char *)targetBvh;
+	nodeData += sizeof(btQuantizedBvh);
+	
+	unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	
+	int nodeCount = m_curNodeIndex;
+
+	if (m_useQuantization)
+	{
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(btSwapEndian(m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex));
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+	
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2];
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1];
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2];
+
+				targetBvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex;
+
+
+			}
+		}
+		nodeData += sizeof(btQuantizedBvhNode) * nodeCount;
+
+		// this clears the pointer in the member variable it doesn't really do anything to the data
+		// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+		// so the memory (which is not freed) is left alone
+		targetBvh->m_quantizedContiguousNodes.initializeFromBuffer(NULL, 0, 0);
+	}
+	else
+	{
+		targetBvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				btSwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMinOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				btSwapVector3Endian(m_contiguousNodes[nodeIndex].m_aabbMaxOrg, targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(btSwapEndian(m_contiguousNodes[nodeIndex].m_escapeIndex));
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(btSwapEndian(m_contiguousNodes[nodeIndex].m_subPart));
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(btSwapEndian(m_contiguousNodes[nodeIndex].m_triangleIndex));
+			}
+		}
+		else
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg = m_contiguousNodes[nodeIndex].m_aabbMinOrg;
+				targetBvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg = m_contiguousNodes[nodeIndex].m_aabbMaxOrg;
+
+				targetBvh->m_contiguousNodes[nodeIndex].m_escapeIndex = m_contiguousNodes[nodeIndex].m_escapeIndex;
+				targetBvh->m_contiguousNodes[nodeIndex].m_subPart = m_contiguousNodes[nodeIndex].m_subPart;
+				targetBvh->m_contiguousNodes[nodeIndex].m_triangleIndex = m_contiguousNodes[nodeIndex].m_triangleIndex;
+			}
+		}
+		nodeData += sizeof(btOptimizedBvhNode) * nodeCount;
+
+		// this clears the pointer in the member variable it doesn't really do anything to the data
+		// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+		// so the memory (which is not freed) is left alone
+		targetBvh->m_contiguousNodes.initializeFromBuffer(NULL, 0, 0);
+	}
+
+	sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+
+	// Now serialize the subtree headers
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, m_subtreeHeaderCount, m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = btSwapEndian(m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(btSwapEndian(m_SubtreeHeaders[i].m_rootNodeIndex));
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(btSwapEndian(m_SubtreeHeaders[i].m_subtreeSize));
+		}
+	}
+	else
+	{
+		for (int i = 0; i < m_subtreeHeaderCount; i++)
+		{
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = (m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = (m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = (m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = (m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = (m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			targetBvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = (m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			targetBvh->m_SubtreeHeaders[i].m_rootNodeIndex = (m_SubtreeHeaders[i].m_rootNodeIndex);
+			targetBvh->m_SubtreeHeaders[i].m_subtreeSize = (m_SubtreeHeaders[i].m_subtreeSize);
+
+			// need to clear padding in destination buffer
+			targetBvh->m_SubtreeHeaders[i].m_padding[0] = 0;
+			targetBvh->m_SubtreeHeaders[i].m_padding[1] = 0;
+			targetBvh->m_SubtreeHeaders[i].m_padding[2] = 0;
+		}
+	}
+	nodeData += sizeof(btBvhSubtreeInfo) * m_subtreeHeaderCount;
+
+	// this clears the pointer in the member variable it doesn't really do anything to the data
+	// it does call the destructor on the contained objects, but they are all classes with no destructor defined
+	// so the memory (which is not freed) is left alone
+	targetBvh->m_SubtreeHeaders.initializeFromBuffer(NULL, 0, 0);
+
+	// this wipes the virtual function table pointer at the start of the buffer for the class
+	*((void**)o_alignedDataBuffer) = NULL;
+
+	return true;
+}
+
+btQuantizedBvh *btQuantizedBvh::deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian)
+{
+
+	if (i_alignedDataBuffer == NULL)// || (((unsigned)i_alignedDataBuffer & BVH_ALIGNMENT_MASK) != 0))
+	{
+		return NULL;
+	}
+	btQuantizedBvh *bvh = (btQuantizedBvh *)i_alignedDataBuffer;
+
+	if (i_swapEndian)
+	{
+		bvh->m_curNodeIndex = static_cast<int>(btSwapEndian(bvh->m_curNodeIndex));
+
+		btUnSwapVector3Endian(bvh->m_bvhAabbMin);
+		btUnSwapVector3Endian(bvh->m_bvhAabbMax);
+		btUnSwapVector3Endian(bvh->m_bvhQuantization);
+
+		bvh->m_traversalMode = (btTraversalMode)btSwapEndian(bvh->m_traversalMode);
+		bvh->m_subtreeHeaderCount = static_cast<int>(btSwapEndian(bvh->m_subtreeHeaderCount));
+	}
+
+	unsigned int calculatedBufSize = bvh->calculateSerializeBufferSize();
+	btAssert(calculatedBufSize <= i_dataBufferSize);
+
+	if (calculatedBufSize > i_dataBufferSize)
+	{
+		return NULL;
+	}
+
+	unsigned char *nodeData = (unsigned char *)bvh;
+	nodeData += sizeof(btQuantizedBvh);
+	
+	unsigned sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+	
+	int nodeCount = bvh->m_curNodeIndex;
+
+	// Must call placement new to fill in virtual function table, etc, but we don't want to overwrite most data, so call a special version of the constructor
+	// Also, m_leafNodes and m_quantizedLeafNodes will be initialized to default values by the constructor
+	new (bvh) btQuantizedBvh(*bvh, false);
+
+	if (bvh->m_useQuantization)
+	{
+		bvh->m_quantizedContiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[2]);
+
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[1]);
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2] = btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[2]);
+
+				bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = static_cast<int>(btSwapEndian(bvh->m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex));
+			}
+		}
+		nodeData += sizeof(btQuantizedBvhNode) * nodeCount;
+	}
+	else
+	{
+		bvh->m_contiguousNodes.initializeFromBuffer(nodeData, nodeCount, nodeCount);
+
+		if (i_swapEndian)
+		{
+			for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++)
+			{
+				btUnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMinOrg);
+				btUnSwapVector3Endian(bvh->m_contiguousNodes[nodeIndex].m_aabbMaxOrg);
+				
+				bvh->m_contiguousNodes[nodeIndex].m_escapeIndex = static_cast<int>(btSwapEndian(bvh->m_contiguousNodes[nodeIndex].m_escapeIndex));
+				bvh->m_contiguousNodes[nodeIndex].m_subPart = static_cast<int>(btSwapEndian(bvh->m_contiguousNodes[nodeIndex].m_subPart));
+				bvh->m_contiguousNodes[nodeIndex].m_triangleIndex = static_cast<int>(btSwapEndian(bvh->m_contiguousNodes[nodeIndex].m_triangleIndex));
+			}
+		}
+		nodeData += sizeof(btOptimizedBvhNode) * nodeCount;
+	}
+
+	sizeToAdd = 0;//(BVH_ALIGNMENT-((unsigned)nodeData & BVH_ALIGNMENT_MASK))&BVH_ALIGNMENT_MASK;
+	nodeData += sizeToAdd;
+
+	// Now serialize the subtree headers
+	bvh->m_SubtreeHeaders.initializeFromBuffer(nodeData, bvh->m_subtreeHeaderCount, bvh->m_subtreeHeaderCount);
+	if (i_swapEndian)
+	{
+		for (int i = 0; i < bvh->m_subtreeHeaderCount; i++)
+		{
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMin[2]);
+
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[0]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[1]);
+			bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2] = btSwapEndian(bvh->m_SubtreeHeaders[i].m_quantizedAabbMax[2]);
+
+			bvh->m_SubtreeHeaders[i].m_rootNodeIndex = static_cast<int>(btSwapEndian(bvh->m_SubtreeHeaders[i].m_rootNodeIndex));
+			bvh->m_SubtreeHeaders[i].m_subtreeSize = static_cast<int>(btSwapEndian(bvh->m_SubtreeHeaders[i].m_subtreeSize));
+		}
+	}
+
+	return bvh;
+}
+
+// Constructor that prevents btVector3's default constructor from being called
+btQuantizedBvh::btQuantizedBvh(btQuantizedBvh &self, bool /* ownsMemory */) :
+m_bvhAabbMin(self.m_bvhAabbMin),
+m_bvhAabbMax(self.m_bvhAabbMax),
+m_bvhQuantization(self.m_bvhQuantization),
+m_bulletVersion(BT_BULLET_VERSION)
+{
+
+}
+
+void btQuantizedBvh::deSerializeFloat(struct btQuantizedBvhFloatData& quantizedBvhFloatData)
+{
+	m_bvhAabbMax.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMax);
+	m_bvhAabbMin.deSerializeFloat(quantizedBvhFloatData.m_bvhAabbMin);
+	m_bvhQuantization.deSerializeFloat(quantizedBvhFloatData.m_bvhQuantization);
+
+	m_curNodeIndex = quantizedBvhFloatData.m_curNodeIndex;
+	m_useQuantization = quantizedBvhFloatData.m_useQuantization!=0;
+	
+	{
+		int numElem = quantizedBvhFloatData.m_numContiguousLeafNodes;
+		m_contiguousNodes.resize(numElem);
+
+		if (numElem)
+		{
+			btOptimizedBvhNodeFloatData* memPtr = quantizedBvhFloatData.m_contiguousNodesPtr;
+
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_contiguousNodes[i].m_aabbMaxOrg.deSerializeFloat(memPtr->m_aabbMaxOrg);
+				m_contiguousNodes[i].m_aabbMinOrg.deSerializeFloat(memPtr->m_aabbMinOrg);
+				m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex;
+				m_contiguousNodes[i].m_subPart = memPtr->m_subPart;
+				m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex;
+			}
+		}
+	}
+
+	{
+		int numElem = quantizedBvhFloatData.m_numQuantizedContiguousNodes;
+		m_quantizedContiguousNodes.resize(numElem);
+		
+		if (numElem)
+		{
+			btQuantizedBvhNodeData* memPtr = quantizedBvhFloatData.m_quantizedContiguousNodesPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex;
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+			}
+		}
+	}
+
+	m_traversalMode = btTraversalMode(quantizedBvhFloatData.m_traversalMode);
+	
+	{
+		int numElem = quantizedBvhFloatData.m_numSubtreeHeaders;
+		m_SubtreeHeaders.resize(numElem);
+		if (numElem)
+		{
+			btBvhSubtreeInfoData* memPtr = quantizedBvhFloatData.m_subTreeInfoPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ;
+				m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+				m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex;
+				m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize;
+			}
+		}
+	}
+}
+
+void btQuantizedBvh::deSerializeDouble(struct btQuantizedBvhDoubleData& quantizedBvhDoubleData)
+{
+	m_bvhAabbMax.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMax);
+	m_bvhAabbMin.deSerializeDouble(quantizedBvhDoubleData.m_bvhAabbMin);
+	m_bvhQuantization.deSerializeDouble(quantizedBvhDoubleData.m_bvhQuantization);
+
+	m_curNodeIndex = quantizedBvhDoubleData.m_curNodeIndex;
+	m_useQuantization = quantizedBvhDoubleData.m_useQuantization!=0;
+	
+	{
+		int numElem = quantizedBvhDoubleData.m_numContiguousLeafNodes;
+		m_contiguousNodes.resize(numElem);
+
+		if (numElem)
+		{
+			btOptimizedBvhNodeDoubleData* memPtr = quantizedBvhDoubleData.m_contiguousNodesPtr;
+
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_contiguousNodes[i].m_aabbMaxOrg.deSerializeDouble(memPtr->m_aabbMaxOrg);
+				m_contiguousNodes[i].m_aabbMinOrg.deSerializeDouble(memPtr->m_aabbMinOrg);
+				m_contiguousNodes[i].m_escapeIndex = memPtr->m_escapeIndex;
+				m_contiguousNodes[i].m_subPart = memPtr->m_subPart;
+				m_contiguousNodes[i].m_triangleIndex = memPtr->m_triangleIndex;
+			}
+		}
+	}
+
+	{
+		int numElem = quantizedBvhDoubleData.m_numQuantizedContiguousNodes;
+		m_quantizedContiguousNodes.resize(numElem);
+		
+		if (numElem)
+		{
+			btQuantizedBvhNodeData* memPtr = quantizedBvhDoubleData.m_quantizedContiguousNodesPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_quantizedContiguousNodes[i].m_escapeIndexOrTriangleIndex = memPtr->m_escapeIndexOrTriangleIndex;
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_quantizedContiguousNodes[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+			}
+		}
+	}
+
+	m_traversalMode = btTraversalMode(quantizedBvhDoubleData.m_traversalMode);
+	
+	{
+		int numElem = quantizedBvhDoubleData.m_numSubtreeHeaders;
+		m_SubtreeHeaders.resize(numElem);
+		if (numElem)
+		{
+			btBvhSubtreeInfoData* memPtr = quantizedBvhDoubleData.m_subTreeInfoPtr;
+			for (int i=0;i<numElem;i++,memPtr++)
+			{
+				m_SubtreeHeaders[i].m_quantizedAabbMax[0] = memPtr->m_quantizedAabbMax[0] ;
+				m_SubtreeHeaders[i].m_quantizedAabbMax[1] = memPtr->m_quantizedAabbMax[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMax[2] = memPtr->m_quantizedAabbMax[2];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[0] = memPtr->m_quantizedAabbMin[0];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[1] = memPtr->m_quantizedAabbMin[1];
+				m_SubtreeHeaders[i].m_quantizedAabbMin[2] = memPtr->m_quantizedAabbMin[2];
+				m_SubtreeHeaders[i].m_rootNodeIndex = memPtr->m_rootNodeIndex;
+				m_SubtreeHeaders[i].m_subtreeSize = memPtr->m_subtreeSize;
+			}
+		}
+	}
+
+}
+
+
+
+///fills the dataBuffer and returns the struct name (and 0 on failure)
+const char*	btQuantizedBvh::serialize(void* dataBuffer, btSerializer* serializer) const
+{
+	btAssert(0);
+	return 0;
+}
+
+
+
+
+
diff --git a/opencl/gpu_sat/host/btQuantizedBvh.h b/opencl/gpu_sat/host/btQuantizedBvh.h
new file mode 100644
index 000000000..45f55e5a3
--- /dev/null
+++ b/opencl/gpu_sat/host/btQuantizedBvh.h
@@ -0,0 +1,581 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_QUANTIZED_BVH_H
+#define BT_QUANTIZED_BVH_H
+
+class btSerializer;
+
+//#define DEBUG_CHECK_DEQUANTIZATION 1
+#ifdef DEBUG_CHECK_DEQUANTIZATION
+#ifdef __SPU__
+#define printf spu_printf
+#endif //__SPU__
+
+#include <stdio.h>
+#include <stdlib.h>
+#endif //DEBUG_CHECK_DEQUANTIZATION
+
+#include "BulletCommon/btVector3.h"
+#include "BulletCommon/btAlignedAllocator.h"
+
+#ifdef BT_USE_DOUBLE_PRECISION
+#define btQuantizedBvhData btQuantizedBvhDoubleData
+#define btOptimizedBvhNodeData btOptimizedBvhNodeDoubleData
+#define btQuantizedBvhDataName "btQuantizedBvhDoubleData"
+#else
+#define btQuantizedBvhData btQuantizedBvhFloatData
+#define btOptimizedBvhNodeData btOptimizedBvhNodeFloatData
+#define btQuantizedBvhDataName "btQuantizedBvhFloatData"
+#endif
+
+
+
+//http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrf__m128.asp
+
+
+//Note: currently we have 16 bytes per quantized node
+#define MAX_SUBTREE_SIZE_IN_BYTES  2048
+
+// 10 gives the potential for 1024 parts, with at most 2^21 (2097152) (minus one
+// actually) triangles each (since the sign bit is reserved
+#define MAX_NUM_PARTS_IN_BITS 10
+
+///btQuantizedBvhNode is a compressed aabb node, 16 bytes.
+///Node can be used for leafnode or internal node. Leafnodes can point to 32-bit triangle index (non-negative range).
+ATTRIBUTE_ALIGNED16	(struct) btQuantizedBvhNode
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes
+	int	m_escapeIndexOrTriangleIndex;
+
+	bool isLeafNode() const
+	{
+		//skipindex is negative (internal node), triangleindex >=0 (leafnode)
+		return (m_escapeIndexOrTriangleIndex >= 0);
+	}
+	int getEscapeIndex() const
+	{
+		btAssert(!isLeafNode());
+		return -m_escapeIndexOrTriangleIndex;
+	}
+	int	getTriangleIndex() const
+	{
+		btAssert(isLeafNode());
+		unsigned int x=0;
+		unsigned int y = (~(x&0))<<(31-MAX_NUM_PARTS_IN_BITS);
+		// Get only the lower bits where the triangle index is stored
+		return (m_escapeIndexOrTriangleIndex&~(y));
+	}
+	int	getPartId() const
+	{
+		btAssert(isLeafNode());
+		// Get only the highest bits where the part index is stored
+		return (m_escapeIndexOrTriangleIndex>>(31-MAX_NUM_PARTS_IN_BITS));
+	}
+}
+;
+
+/// btOptimizedBvhNode contains both internal and leaf node information.
+/// Total node size is 44 bytes / node. You can use the compressed version of 16 bytes.
+ATTRIBUTE_ALIGNED16 (struct) btOptimizedBvhNode
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	//32 bytes
+	btVector3	m_aabbMinOrg;
+	btVector3	m_aabbMaxOrg;
+
+	//4
+	int	m_escapeIndex;
+
+	//8
+	//for child nodes
+	int	m_subPart;
+	int	m_triangleIndex;
+
+//pad the size to 64 bytes
+	char	m_padding[20];
+};
+
+
+///btBvhSubtreeInfo provides info to gather a subtree of limited size
+ATTRIBUTE_ALIGNED16(class) btBvhSubtreeInfo
+{
+public:
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	//12 bytes
+	unsigned short int	m_quantizedAabbMin[3];
+	unsigned short int	m_quantizedAabbMax[3];
+	//4 bytes, points to the root of the subtree
+	int			m_rootNodeIndex;
+	//4 bytes
+	int			m_subtreeSize;
+	int			m_padding[3];
+
+	btBvhSubtreeInfo()
+	{
+		//memset(&m_padding[0], 0, sizeof(m_padding));
+	}
+
+
+	void	setAabbFromQuantizeNode(const btQuantizedBvhNode& quantizedNode)
+	{
+		m_quantizedAabbMin[0] = quantizedNode.m_quantizedAabbMin[0];
+		m_quantizedAabbMin[1] = quantizedNode.m_quantizedAabbMin[1];
+		m_quantizedAabbMin[2] = quantizedNode.m_quantizedAabbMin[2];
+		m_quantizedAabbMax[0] = quantizedNode.m_quantizedAabbMax[0];
+		m_quantizedAabbMax[1] = quantizedNode.m_quantizedAabbMax[1];
+		m_quantizedAabbMax[2] = quantizedNode.m_quantizedAabbMax[2];
+	}
+}
+;
+
+
+class btNodeOverlapCallback
+{
+public:
+	virtual ~btNodeOverlapCallback() {};
+
+	virtual void processNode(int subPart, int triangleIndex) = 0;
+};
+
+#include "BulletCommon/btAlignedAllocator.h"
+#include "BulletCommon/btAlignedObjectArray.h"
+
+
+
+///for code readability:
+typedef btAlignedObjectArray<btOptimizedBvhNode>	NodeArray;
+typedef btAlignedObjectArray<btQuantizedBvhNode>	QuantizedNodeArray;
+typedef btAlignedObjectArray<btBvhSubtreeInfo>		BvhSubtreeInfoArray;
+
+
+///The btQuantizedBvh class stores an AABB tree that can be quickly traversed on CPU and Cell SPU.
+///It is used by the btBvhTriangleMeshShape as midphase, and by the btMultiSapBroadphase.
+///It is recommended to use quantization for better performance and lower memory requirements.
+ATTRIBUTE_ALIGNED16(class) btQuantizedBvh
+{
+public:
+	enum btTraversalMode
+	{
+		TRAVERSAL_STACKLESS = 0,
+		TRAVERSAL_STACKLESS_CACHE_FRIENDLY,
+		TRAVERSAL_RECURSIVE
+	};
+
+protected:
+
+
+	btVector3			m_bvhAabbMin;
+	btVector3			m_bvhAabbMax;
+	btVector3			m_bvhQuantization;
+
+	int					m_bulletVersion;	//for serialization versioning. It could also be used to detect endianess.
+
+	int					m_curNodeIndex;
+	//quantization data
+	bool				m_useQuantization;
+
+
+
+	NodeArray			m_leafNodes;
+	NodeArray			m_contiguousNodes;
+	QuantizedNodeArray	m_quantizedLeafNodes;
+	QuantizedNodeArray	m_quantizedContiguousNodes;
+	
+	btTraversalMode	m_traversalMode;
+	BvhSubtreeInfoArray		m_SubtreeHeaders;
+
+	//This is only used for serialization so we don't have to add serialization directly to btAlignedObjectArray
+	mutable int m_subtreeHeaderCount;
+
+	
+
+
+
+	///two versions, one for quantized and normal nodes. This allows code-reuse while maintaining readability (no template/macro!)
+	///this might be refactored into a virtual, it is usually not calculated at run-time
+	void	setInternalNodeAabbMin(int nodeIndex, const btVector3& aabbMin)
+	{
+		if (m_useQuantization)
+		{
+			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[0] ,aabbMin,0);
+		} else
+		{
+			m_contiguousNodes[nodeIndex].m_aabbMinOrg = aabbMin;
+
+		}
+	}
+	void	setInternalNodeAabbMax(int nodeIndex,const btVector3& aabbMax)
+	{
+		if (m_useQuantization)
+		{
+			quantize(&m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[0],aabbMax,1);
+		} else
+		{
+			m_contiguousNodes[nodeIndex].m_aabbMaxOrg = aabbMax;
+		}
+	}
+
+	btVector3 getAabbMin(int nodeIndex) const
+	{
+		if (m_useQuantization)
+		{
+			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMin[0]);
+		}
+		//non-quantized
+		return m_leafNodes[nodeIndex].m_aabbMinOrg;
+
+	}
+	btVector3 getAabbMax(int nodeIndex) const
+	{
+		if (m_useQuantization)
+		{
+			return unQuantize(&m_quantizedLeafNodes[nodeIndex].m_quantizedAabbMax[0]);
+		} 
+		//non-quantized
+		return m_leafNodes[nodeIndex].m_aabbMaxOrg;
+		
+	}
+
+	
+	void	setInternalNodeEscapeIndex(int nodeIndex, int escapeIndex)
+	{
+		if (m_useQuantization)
+		{
+			m_quantizedContiguousNodes[nodeIndex].m_escapeIndexOrTriangleIndex = -escapeIndex;
+		} 
+		else
+		{
+			m_contiguousNodes[nodeIndex].m_escapeIndex = escapeIndex;
+		}
+
+	}
+
+	void mergeInternalNodeAabb(int nodeIndex,const btVector3& newAabbMin,const btVector3& newAabbMax) 
+	{
+		if (m_useQuantization)
+		{
+			unsigned short int quantizedAabbMin[3];
+			unsigned short int quantizedAabbMax[3];
+			quantize(quantizedAabbMin,newAabbMin,0);
+			quantize(quantizedAabbMax,newAabbMax,1);
+			for (int i=0;i<3;i++)
+			{
+				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] > quantizedAabbMin[i])
+					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMin[i] = quantizedAabbMin[i];
+
+				if (m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] < quantizedAabbMax[i])
+					m_quantizedContiguousNodes[nodeIndex].m_quantizedAabbMax[i] = quantizedAabbMax[i];
+
+			}
+		} else
+		{
+			//non-quantized
+			m_contiguousNodes[nodeIndex].m_aabbMinOrg.setMin(newAabbMin);
+			m_contiguousNodes[nodeIndex].m_aabbMaxOrg.setMax(newAabbMax);		
+		}
+	}
+
+	void	swapLeafNodes(int firstIndex,int secondIndex);
+
+	void	assignInternalNodeFromLeafNode(int internalNode,int leafNodeIndex);
+
+protected:
+
+	
+
+	void	buildTree	(int startIndex,int endIndex);
+
+	int	calcSplittingAxis(int startIndex,int endIndex);
+
+	int	sortAndCalcSplittingIndex(int startIndex,int endIndex,int splitAxis);
+	
+	void	walkStacklessTree(btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const;
+
+	void	walkStacklessQuantizedTreeAgainstRay(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin, const btVector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
+	void	walkStacklessQuantizedTree(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax,int startNodeIndex,int endNodeIndex) const;
+	void	walkStacklessTreeAgainstRay(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin, const btVector3& aabbMax, int startNodeIndex,int endNodeIndex) const;
+
+	///tree traversal designed for small-memory processors like PS3 SPU
+	void	walkStacklessQuantizedTreeCacheFriendly(btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
+
+	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
+	void	walkRecursiveQuantizedTreeAgainstQueryAabb(const btQuantizedBvhNode* currentNode,btNodeOverlapCallback* nodeCallback,unsigned short int* quantizedQueryAabbMin,unsigned short int* quantizedQueryAabbMax) const;
+
+	///use the 16-byte stackless 'skipindex' node tree to do a recursive traversal
+	void	walkRecursiveQuantizedTreeAgainstQuantizedTree(const btQuantizedBvhNode* treeNodeA,const btQuantizedBvhNode* treeNodeB,btNodeOverlapCallback* nodeCallback) const;
+	
+
+
+
+	void	updateSubtreeHeaders(int leftChildNodexIndex,int rightChildNodexIndex);
+
+public:
+	
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	btQuantizedBvh();
+
+	virtual ~btQuantizedBvh();
+
+	
+	///***************************************** expert/internal use only *************************
+	void	setQuantizationValues(const btVector3& bvhAabbMin,const btVector3& bvhAabbMax,btScalar quantizationMargin=btScalar(1.0));
+	QuantizedNodeArray&	getLeafNodeArray() {			return	m_quantizedLeafNodes;	}
+	///buildInternal is expert use only: assumes that setQuantizationValues and LeafNodeArray are initialized
+	void	buildInternal();
+	///***************************************** expert/internal use only *************************
+
+	void	reportAabbOverlappingNodex(btNodeOverlapCallback* nodeCallback,const btVector3& aabbMin,const btVector3& aabbMax) const;
+	void	reportRayOverlappingNodex (btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget) const;
+	void	reportBoxCastOverlappingNodex(btNodeOverlapCallback* nodeCallback, const btVector3& raySource, const btVector3& rayTarget, const btVector3& aabbMin,const btVector3& aabbMax) const;
+
+		SIMD_FORCE_INLINE void quantize(unsigned short* out, const btVector3& point,int isMax) const
+	{
+
+		btAssert(m_useQuantization);
+
+		btAssert(point.getX() <= m_bvhAabbMax.getX());
+		btAssert(point.getY() <= m_bvhAabbMax.getY());
+		btAssert(point.getZ() <= m_bvhAabbMax.getZ());
+
+		btAssert(point.getX() >= m_bvhAabbMin.getX());
+		btAssert(point.getY() >= m_bvhAabbMin.getY());
+		btAssert(point.getZ() >= m_bvhAabbMin.getZ());
+
+		btVector3 v = (point - m_bvhAabbMin) * m_bvhQuantization;
+		///Make sure rounding is done in a way that unQuantize(quantizeWithClamp(...)) is conservative
+		///end-points always set the first bit, so that they are sorted properly (so that neighbouring AABBs overlap properly)
+		///@todo: double-check this
+		if (isMax)
+		{
+			out[0] = (unsigned short) (((unsigned short)(v.getX()+btScalar(1.)) | 1));
+			out[1] = (unsigned short) (((unsigned short)(v.getY()+btScalar(1.)) | 1));
+			out[2] = (unsigned short) (((unsigned short)(v.getZ()+btScalar(1.)) | 1));
+		} else
+		{
+			out[0] = (unsigned short) (((unsigned short)(v.getX()) & 0xfffe));
+			out[1] = (unsigned short) (((unsigned short)(v.getY()) & 0xfffe));
+			out[2] = (unsigned short) (((unsigned short)(v.getZ()) & 0xfffe));
+		}
+
+
+#ifdef DEBUG_CHECK_DEQUANTIZATION
+		btVector3 newPoint = unQuantize(out);
+		if (isMax)
+		{
+			if (newPoint.getX() < point.getX())
+			{
+				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
+			}
+			if (newPoint.getY() < point.getY())
+			{
+				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
+			}
+			if (newPoint.getZ() < point.getZ())
+			{
+
+				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
+			}
+		} else
+		{
+			if (newPoint.getX() > point.getX())
+			{
+				printf("unconservative X, diffX = %f, oldX=%f,newX=%f\n",newPoint.getX()-point.getX(), newPoint.getX(),point.getX());
+			}
+			if (newPoint.getY() > point.getY())
+			{
+				printf("unconservative Y, diffY = %f, oldY=%f,newY=%f\n",newPoint.getY()-point.getY(), newPoint.getY(),point.getY());
+			}
+			if (newPoint.getZ() > point.getZ())
+			{
+				printf("unconservative Z, diffZ = %f, oldZ=%f,newZ=%f\n",newPoint.getZ()-point.getZ(), newPoint.getZ(),point.getZ());
+			}
+		}
+#endif //DEBUG_CHECK_DEQUANTIZATION
+
+	}
+
+
+	SIMD_FORCE_INLINE void quantizeWithClamp(unsigned short* out, const btVector3& point2,int isMax) const
+	{
+
+		btAssert(m_useQuantization);
+
+		btVector3 clampedPoint(point2);
+		clampedPoint.setMax(m_bvhAabbMin);
+		clampedPoint.setMin(m_bvhAabbMax);
+
+		quantize(out,clampedPoint,isMax);
+
+	}
+	
+	SIMD_FORCE_INLINE btVector3	unQuantize(const unsigned short* vecIn) const
+	{
+			btVector3	vecOut;
+			vecOut.setValue(
+			(btScalar)(vecIn[0]) / (m_bvhQuantization.getX()),
+			(btScalar)(vecIn[1]) / (m_bvhQuantization.getY()),
+			(btScalar)(vecIn[2]) / (m_bvhQuantization.getZ()));
+			vecOut += m_bvhAabbMin;
+			return vecOut;
+	}
+
+	///setTraversalMode let's you choose between stackless, recursive or stackless cache friendly tree traversal. Note this is only implemented for quantized trees.
+	void	setTraversalMode(btTraversalMode	traversalMode)
+	{
+		m_traversalMode = traversalMode;
+	}
+
+
+	SIMD_FORCE_INLINE QuantizedNodeArray&	getQuantizedNodeArray()
+	{	
+		return	m_quantizedContiguousNodes;
+	}
+
+
+	SIMD_FORCE_INLINE BvhSubtreeInfoArray&	getSubtreeInfoArray()
+	{
+		return m_SubtreeHeaders;
+	}
+
+////////////////////////////////////////////////////////////////////
+
+	/////Calculate space needed to store BVH for serialization
+	unsigned calculateSerializeBufferSize() const;
+
+	/// Data buffer MUST be 16 byte aligned
+	virtual bool serialize(void *o_alignedDataBuffer, unsigned i_dataBufferSize, bool i_swapEndian) const;
+
+	///deSerializeInPlace loads and initializes a BVH from a buffer in memory 'in place'
+	static btQuantizedBvh *deSerializeInPlace(void *i_alignedDataBuffer, unsigned int i_dataBufferSize, bool i_swapEndian);
+
+	static unsigned int getAlignmentSerializationPadding();
+//////////////////////////////////////////////////////////////////////
+
+	
+	virtual	int	calculateSerializeBufferSizeNew() const;
+
+	///fills the dataBuffer and returns the struct name (and 0 on failure)
+	virtual	const char*	serialize(void* dataBuffer, btSerializer* serializer) const;
+
+	virtual	void deSerializeFloat(struct btQuantizedBvhFloatData& quantizedBvhFloatData);
+
+	virtual	void deSerializeDouble(struct btQuantizedBvhDoubleData& quantizedBvhDoubleData);
+
+
+////////////////////////////////////////////////////////////////////
+
+	SIMD_FORCE_INLINE bool isQuantized()
+	{
+		return m_useQuantization;
+	}
+
+private:
+	// Special "copy" constructor that allows for in-place deserialization
+	// Prevents btVector3's default constructor from being called, but doesn't inialize much else
+	// ownsMemory should most likely be false if deserializing, and if you are not, don't call this (it also changes the function signature, which we need)
+	btQuantizedBvh(btQuantizedBvh &other, bool ownsMemory);
+
+}
+;
+
+
+struct	btBvhSubtreeInfoData
+{
+	int			m_rootNodeIndex;
+	int			m_subtreeSize;
+	unsigned short m_quantizedAabbMin[3];
+	unsigned short m_quantizedAabbMax[3];
+};
+
+struct btOptimizedBvhNodeFloatData
+{
+	btVector3FloatData	m_aabbMinOrg;
+	btVector3FloatData	m_aabbMaxOrg;
+	int	m_escapeIndex;
+	int	m_subPart;
+	int	m_triangleIndex;
+	char m_pad[4];
+};
+
+struct btOptimizedBvhNodeDoubleData
+{
+	btVector3DoubleData	m_aabbMinOrg;
+	btVector3DoubleData	m_aabbMaxOrg;
+	int	m_escapeIndex;
+	int	m_subPart;
+	int	m_triangleIndex;
+	char	m_pad[4];
+};
+
+
+struct btQuantizedBvhNodeData
+{
+	unsigned short m_quantizedAabbMin[3];
+	unsigned short m_quantizedAabbMax[3];
+	int	m_escapeIndexOrTriangleIndex;
+};
+
+struct	btQuantizedBvhFloatData
+{
+	btVector3FloatData			m_bvhAabbMin;
+	btVector3FloatData			m_bvhAabbMax;
+	btVector3FloatData			m_bvhQuantization;
+	int					m_curNodeIndex;
+	int					m_useQuantization;
+	int					m_numContiguousLeafNodes;
+	int					m_numQuantizedContiguousNodes;
+	btOptimizedBvhNodeFloatData	*m_contiguousNodesPtr;
+	btQuantizedBvhNodeData		*m_quantizedContiguousNodesPtr;
+	btBvhSubtreeInfoData	*m_subTreeInfoPtr;
+	int					m_traversalMode;
+	int					m_numSubtreeHeaders;
+	
+};
+
+struct	btQuantizedBvhDoubleData
+{
+	btVector3DoubleData			m_bvhAabbMin;
+	btVector3DoubleData			m_bvhAabbMax;
+	btVector3DoubleData			m_bvhQuantization;
+	int							m_curNodeIndex;
+	int							m_useQuantization;
+	int							m_numContiguousLeafNodes;
+	int							m_numQuantizedContiguousNodes;
+	btOptimizedBvhNodeDoubleData	*m_contiguousNodesPtr;
+	btQuantizedBvhNodeData			*m_quantizedContiguousNodesPtr;
+
+	int							m_traversalMode;
+	int							m_numSubtreeHeaders;
+	btBvhSubtreeInfoData		*m_subTreeInfoPtr;
+};
+
+
+SIMD_FORCE_INLINE	int	btQuantizedBvh::calculateSerializeBufferSizeNew() const
+{
+	return sizeof(btQuantizedBvhData);
+}
+
+
+
+#endif //BT_QUANTIZED_BVH_H
diff --git a/opencl/gpu_sat/host/btStridingMeshInterface.cpp b/opencl/gpu_sat/host/btStridingMeshInterface.cpp
new file mode 100644
index 000000000..298f6cbf7
--- /dev/null
+++ b/opencl/gpu_sat/host/btStridingMeshInterface.cpp
@@ -0,0 +1,214 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btStridingMeshInterface.h"
+
+
+btStridingMeshInterface::~btStridingMeshInterface()
+{
+
+}
+
+
+void	btStridingMeshInterface::InternalProcessAllTriangles(btInternalTriangleIndexCallback* callback,const btVector3& aabbMin,const btVector3& aabbMax) const
+{
+	(void)aabbMin;
+	(void)aabbMax;
+	int numtotalphysicsverts = 0;
+	int part,graphicssubparts = getNumSubParts();
+	const unsigned char * vertexbase;
+	const unsigned char * indexbase;
+	int indexstride;
+	PHY_ScalarType type;
+	PHY_ScalarType gfxindextype;
+	int stride,numverts,numtriangles;
+	int gfxindex;
+	btVector3 triangle[3];
+
+	btVector3 meshScaling = getScaling();
+
+	///if the number of parts is big, the performance might drop due to the innerloop switch on indextype
+	for (part=0;part<graphicssubparts ;part++)
+	{
+		getLockedReadOnlyVertexIndexBase(&vertexbase,numverts,type,stride,&indexbase,indexstride,numtriangles,gfxindextype,part);
+		numtotalphysicsverts+=numtriangles*3; //upper bound
+
+		///unlike that developers want to pass in double-precision meshes in single-precision Bullet build
+		///so disable this feature by default
+		///see patch http://code.google.com/p/bullet/issues/detail?id=213
+
+		switch (type)
+		{
+		case PHY_FLOAT:
+		 {
+
+			 float* graphicsbase;
+
+			 switch (gfxindextype)
+			 {
+			 case PHY_INTEGER:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			 case PHY_SHORT:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			case PHY_UCHAR:
+				 {
+					 for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+					 {
+						 unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
+						 graphicsbase = (float*)(vertexbase+tri_indices[0]*stride);
+						 triangle[0].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[1]*stride);
+						 triangle[1].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 graphicsbase = (float*)(vertexbase+tri_indices[2]*stride);
+						 triangle[2].setValue(graphicsbase[0]*meshScaling.getX(),graphicsbase[1]*meshScaling.getY(),	graphicsbase[2]*meshScaling.getZ());
+						 callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+					 }
+					 break;
+				 }
+			 default:
+				 btAssert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
+			 }
+			 break;
+		 }
+
+		case PHY_DOUBLE:
+			{
+				double* graphicsbase;
+
+				switch (gfxindextype)
+				{
+				case PHY_INTEGER:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned int* tri_indices= (unsigned int*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),(btScalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),  (btScalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),  (btScalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				case PHY_SHORT:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned short int* tri_indices= (unsigned short int*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),(btScalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),  (btScalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),  (btScalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				case PHY_UCHAR:
+					{
+						for (gfxindex=0;gfxindex<numtriangles;gfxindex++)
+						{
+							unsigned char* tri_indices= (unsigned char*)(indexbase+gfxindex*indexstride);
+							graphicsbase = (double*)(vertexbase+tri_indices[0]*stride);
+							triangle[0].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),(btScalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[1]*stride);
+							triangle[1].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),  (btScalar)graphicsbase[2]*meshScaling.getZ());
+							graphicsbase = (double*)(vertexbase+tri_indices[2]*stride);
+							triangle[2].setValue((btScalar)graphicsbase[0]*meshScaling.getX(),(btScalar)graphicsbase[1]*meshScaling.getY(),  (btScalar)graphicsbase[2]*meshScaling.getZ());
+							callback->internalProcessTriangleIndex(triangle,part,gfxindex);
+						}
+						break;
+					}
+				default:
+					btAssert((gfxindextype == PHY_INTEGER) || (gfxindextype == PHY_SHORT));
+				}
+				break;
+			}
+		default:
+			btAssert((type == PHY_FLOAT) || (type == PHY_DOUBLE));
+		}
+
+		unLockReadOnlyVertexBase(part);
+	}
+}
+
+void	btStridingMeshInterface::calculateAabbBruteForce(btVector3& aabbMin,btVector3& aabbMax)
+{
+
+	struct	AabbCalculationCallback : public btInternalTriangleIndexCallback
+	{
+		btVector3	m_aabbMin;
+		btVector3	m_aabbMax;
+
+		AabbCalculationCallback()
+		{
+			m_aabbMin.setValue(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+			m_aabbMax.setValue(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
+		}
+
+		virtual void internalProcessTriangleIndex(btVector3* triangle,int partId,int  triangleIndex)
+		{
+			(void)partId;
+			(void)triangleIndex;
+
+			m_aabbMin.setMin(triangle[0]);
+			m_aabbMax.setMax(triangle[0]);
+			m_aabbMin.setMin(triangle[1]);
+			m_aabbMax.setMax(triangle[1]);
+			m_aabbMin.setMin(triangle[2]);
+			m_aabbMax.setMax(triangle[2]);
+		}
+	};
+
+	//first calculate the total aabb for all triangles
+	AabbCalculationCallback	aabbCallback;
+	aabbMin.setValue(btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT),btScalar(-BT_LARGE_FLOAT));
+	aabbMax.setValue(btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT),btScalar(BT_LARGE_FLOAT));
+	InternalProcessAllTriangles(&aabbCallback,aabbMin,aabbMax);
+
+	aabbMin = aabbCallback.m_aabbMin;
+	aabbMax = aabbCallback.m_aabbMax;
+}
+
+
diff --git a/opencl/gpu_sat/host/btStridingMeshInterface.h b/opencl/gpu_sat/host/btStridingMeshInterface.h
new file mode 100644
index 000000000..b457df484
--- /dev/null
+++ b/opencl/gpu_sat/host/btStridingMeshInterface.h
@@ -0,0 +1,167 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_STRIDING_MESHINTERFACE_H
+#define BT_STRIDING_MESHINTERFACE_H
+
+#include "BulletCommon/btVector3.h"
+#include "btTriangleCallback.h"
+//#include "btConcaveShape.h"
+
+
+enum  	PHY_ScalarType { 
+  PHY_FLOAT, PHY_DOUBLE, PHY_INTEGER, PHY_SHORT, 
+  PHY_FIXEDPOINT88, PHY_UCHAR 
+};
+
+
+///	The btStridingMeshInterface is the interface class for high performance generic access to triangle meshes, used in combination with btBvhTriangleMeshShape and some other collision shapes.
+/// Using index striding of 3*sizeof(integer) it can use triangle arrays, using index striding of 1*sizeof(integer) it can handle triangle strips.
+/// It allows for sharing graphics and collision meshes. Also it provides locking/unlocking of graphics meshes that are in gpu memory.
+ATTRIBUTE_ALIGNED16(class ) btStridingMeshInterface
+{
+	protected:
+	
+		btVector3 m_scaling;
+
+	public:
+		BT_DECLARE_ALIGNED_ALLOCATOR();
+		
+		btStridingMeshInterface() :m_scaling(btScalar(1.),btScalar(1.),btScalar(1.))
+		{
+
+		}
+
+		virtual ~btStridingMeshInterface();
+
+
+
+		virtual void	InternalProcessAllTriangles(btInternalTriangleIndexCallback* callback,const btVector3& aabbMin,const btVector3& aabbMax) const;
+
+		///brute force method to calculate aabb
+		void	calculateAabbBruteForce(btVector3& aabbMin,btVector3& aabbMax);
+
+		/// get read and write access to a subpart of a triangle mesh
+		/// this subpart has a continuous array of vertices and indices
+		/// in this way the mesh can be handled as chunks of memory with striding
+		/// very similar to OpenGL vertexarray support
+		/// make a call to unLockVertexBase when the read and write access is finished	
+		virtual void	getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0)=0;
+		
+		virtual void	getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& stride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const=0;
+	
+		/// unLockVertexBase finishes the access to a subpart of the triangle mesh
+		/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
+		virtual void	unLockVertexBase(int subpart)=0;
+
+		virtual void	unLockReadOnlyVertexBase(int subpart) const=0;
+
+
+		/// getNumSubParts returns the number of seperate subparts
+		/// each subpart has a continuous array of vertices and indices
+		virtual int		getNumSubParts() const=0;
+
+		virtual void	preallocateVertices(int numverts)=0;
+		virtual void	preallocateIndices(int numindices)=0;
+
+		virtual bool	hasPremadeAabb() const { return false; }
+		virtual void	setPremadeAabb(const btVector3& aabbMin, const btVector3& aabbMax ) const
+                {
+                        (void) aabbMin;
+                        (void) aabbMax;
+                }
+		virtual void	getPremadeAabb(btVector3* aabbMin, btVector3* aabbMax ) const
+        {
+            (void) aabbMin;
+            (void) aabbMax;
+        }
+
+		const btVector3&	getScaling() const {
+			return m_scaling;
+		}
+		void	setScaling(const btVector3& scaling)
+		{
+			m_scaling = scaling;
+		}
+
+		virtual	int	calculateSerializeBufferSize() const;
+
+		///fills the dataBuffer and returns the struct name (and 0 on failure)
+		//virtual	const char*	serialize(void* dataBuffer, btSerializer* serializer) const;
+
+
+};
+
+struct	btIntIndexData
+{
+	int	m_value;
+};
+
+struct	btShortIntIndexData
+{
+	short m_value;
+	char m_pad[2];
+};
+
+struct	btShortIntIndexTripletData
+{
+	short	m_values[3];
+	char	m_pad[2];
+};
+
+struct	btCharIndexTripletData
+{
+	unsigned char m_values[3];
+	char	m_pad;
+};
+
+
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btMeshPartData
+{
+	btVector3FloatData			*m_vertices3f;
+	btVector3DoubleData			*m_vertices3d;
+
+	btIntIndexData				*m_indices32;
+	btShortIntIndexTripletData	*m_3indices16;
+	btCharIndexTripletData		*m_3indices8;
+
+	btShortIntIndexData			*m_indices16;//backwards compatibility
+
+	int                     m_numTriangles;//length of m_indices = m_numTriangles
+	int                     m_numVertices;
+};
+
+
+///do not change those serialization structures, it requires an updated sBulletDNAstr/sBulletDNAstr64
+struct	btStridingMeshInterfaceData
+{
+	btMeshPartData	*m_meshPartsPtr;
+	btVector3FloatData	m_scaling;
+	int	m_numMeshParts;
+	char m_padding[4];
+};
+
+
+
+
+SIMD_FORCE_INLINE	int	btStridingMeshInterface::calculateSerializeBufferSize() const
+{
+	return sizeof(btStridingMeshInterfaceData);
+}
+
+
+
+#endif //BT_STRIDING_MESHINTERFACE_H
diff --git a/opencl/gpu_sat/host/btTriangleCallback.cpp b/opencl/gpu_sat/host/btTriangleCallback.cpp
new file mode 100644
index 000000000..f558bf6d2
--- /dev/null
+++ b/opencl/gpu_sat/host/btTriangleCallback.cpp
@@ -0,0 +1,28 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btTriangleCallback.h"
+
+btTriangleCallback::~btTriangleCallback()
+{
+
+}
+
+
+btInternalTriangleIndexCallback::~btInternalTriangleIndexCallback()
+{
+
+}
+
diff --git a/opencl/gpu_sat/host/btTriangleCallback.h b/opencl/gpu_sat/host/btTriangleCallback.h
new file mode 100644
index 000000000..e9ce72ffb
--- /dev/null
+++ b/opencl/gpu_sat/host/btTriangleCallback.h
@@ -0,0 +1,42 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_TRIANGLE_CALLBACK_H
+#define BT_TRIANGLE_CALLBACK_H
+
+#include "BulletCommon/btVector3.h"
+
+
+///The btTriangleCallback provides a callback for each overlapping triangle when calling processAllTriangles.
+///This callback is called by processAllTriangles for all btConcaveShape derived class, such as  btBvhTriangleMeshShape, btStaticPlaneShape and btHeightfieldTerrainShape.
+class btTriangleCallback
+{
+public:
+
+	virtual ~btTriangleCallback();
+	virtual void processTriangle(btVector3* triangle, int partId, int triangleIndex) = 0;
+};
+
+class btInternalTriangleIndexCallback
+{
+public:
+
+	virtual ~btInternalTriangleIndexCallback();
+	virtual void internalProcessTriangleIndex(btVector3* triangle,int partId,int  triangleIndex) = 0;
+};
+
+
+
+#endif //BT_TRIANGLE_CALLBACK_H
diff --git a/opencl/gpu_sat/host/btTriangleIndexVertexArray.cpp b/opencl/gpu_sat/host/btTriangleIndexVertexArray.cpp
new file mode 100644
index 000000000..a665024cb
--- /dev/null
+++ b/opencl/gpu_sat/host/btTriangleIndexVertexArray.cpp
@@ -0,0 +1,95 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btTriangleIndexVertexArray.h"
+
+btTriangleIndexVertexArray::btTriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,btScalar* vertexBase,int vertexStride)
+: m_hasAabb(0)
+{
+	btIndexedMesh mesh;
+
+	mesh.m_numTriangles = numTriangles;
+	mesh.m_triangleIndexBase = (const unsigned char *)triangleIndexBase;
+	mesh.m_triangleIndexStride = triangleIndexStride;
+	mesh.m_numVertices = numVertices;
+	mesh.m_vertexBase = (const unsigned char *)vertexBase;
+	mesh.m_vertexStride = vertexStride;
+
+	addIndexedMesh(mesh);
+
+}
+
+btTriangleIndexVertexArray::~btTriangleIndexVertexArray()
+{
+
+}
+
+void	btTriangleIndexVertexArray::getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart)
+{
+	btAssert(subpart< getNumSubParts() );
+
+	btIndexedMesh& mesh = m_indexedMeshes[subpart];
+
+	numverts = mesh.m_numVertices;
+	(*vertexbase) = (unsigned char *) mesh.m_vertexBase;
+
+   type = mesh.m_vertexType;
+
+	vertexStride = mesh.m_vertexStride;
+
+	numfaces = mesh.m_numTriangles;
+
+	(*indexbase) = (unsigned char *)mesh.m_triangleIndexBase;
+	indexstride = mesh.m_triangleIndexStride;
+	indicestype = mesh.m_indexType;
+}
+
+void	btTriangleIndexVertexArray::getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart) const
+{
+	const btIndexedMesh& mesh = m_indexedMeshes[subpart];
+
+	numverts = mesh.m_numVertices;
+	(*vertexbase) = (const unsigned char *)mesh.m_vertexBase;
+
+   type = mesh.m_vertexType;
+   
+	vertexStride = mesh.m_vertexStride;
+
+	numfaces = mesh.m_numTriangles;
+	(*indexbase) = (const unsigned char *)mesh.m_triangleIndexBase;
+	indexstride = mesh.m_triangleIndexStride;
+	indicestype = mesh.m_indexType;
+}
+
+bool	btTriangleIndexVertexArray::hasPremadeAabb() const
+{
+	return (m_hasAabb == 1);
+}
+
+
+void	btTriangleIndexVertexArray::setPremadeAabb(const btVector3& aabbMin, const btVector3& aabbMax ) const
+{
+	m_aabbMin = aabbMin;
+	m_aabbMax = aabbMax;
+	m_hasAabb = 1; // this is intentionally an int see notes in header
+}
+
+void	btTriangleIndexVertexArray::getPremadeAabb(btVector3* aabbMin, btVector3* aabbMax ) const
+{
+	*aabbMin = m_aabbMin;
+	*aabbMax = m_aabbMax;
+}
+
+
diff --git a/opencl/gpu_sat/host/btTriangleIndexVertexArray.h b/opencl/gpu_sat/host/btTriangleIndexVertexArray.h
new file mode 100644
index 000000000..bc6f05e26
--- /dev/null
+++ b/opencl/gpu_sat/host/btTriangleIndexVertexArray.h
@@ -0,0 +1,133 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_TRIANGLE_INDEX_VERTEX_ARRAY_H
+#define BT_TRIANGLE_INDEX_VERTEX_ARRAY_H
+
+#include "btStridingMeshInterface.h"
+#include "BulletCommon/btAlignedObjectArray.h"
+#include "BulletCommon/btScalar.h"
+
+
+///The btIndexedMesh indexes a single vertex and index array. Multiple btIndexedMesh objects can be passed into a btTriangleIndexVertexArray using addIndexedMesh.
+///Instead of the number of indices, we pass the number of triangles.
+ATTRIBUTE_ALIGNED16( struct)	btIndexedMesh
+{
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+   int                     m_numTriangles;
+   const unsigned char *   m_triangleIndexBase;
+   // Size in byte of the indices for one triangle (3*sizeof(index_type) if the indices are tightly packed)
+   int                     m_triangleIndexStride;
+   int                     m_numVertices;
+   const unsigned char *   m_vertexBase;
+   // Size of a vertex, in bytes
+   int                     m_vertexStride;
+
+   // The index type is set when adding an indexed mesh to the
+   // btTriangleIndexVertexArray, do not set it manually
+   PHY_ScalarType m_indexType;
+
+   // The vertex type has a default type similar to Bullet's precision mode (float or double)
+   // but can be set manually if you for example run Bullet with double precision but have
+   // mesh data in single precision..
+   PHY_ScalarType m_vertexType;
+
+
+   btIndexedMesh()
+	   :m_indexType(PHY_INTEGER),
+#ifdef BT_USE_DOUBLE_PRECISION
+      m_vertexType(PHY_DOUBLE)
+#else // BT_USE_DOUBLE_PRECISION
+      m_vertexType(PHY_FLOAT)
+#endif // BT_USE_DOUBLE_PRECISION
+      {
+      }
+}
+;
+
+
+typedef btAlignedObjectArray<btIndexedMesh>	IndexedMeshArray;
+
+///The btTriangleIndexVertexArray allows to access multiple triangle meshes, by indexing into existing triangle/index arrays.
+///Additional meshes can be added using addIndexedMesh
+///No duplcate is made of the vertex/index data, it only indexes into external vertex/index arrays.
+///So keep those arrays around during the lifetime of this btTriangleIndexVertexArray.
+ATTRIBUTE_ALIGNED16( class) btTriangleIndexVertexArray : public btStridingMeshInterface
+{
+protected:
+	IndexedMeshArray	m_indexedMeshes;
+	int m_pad[2];
+	mutable int m_hasAabb; // using int instead of bool to maintain alignment
+	mutable btVector3 m_aabbMin;
+	mutable btVector3 m_aabbMax;
+
+public:
+
+	BT_DECLARE_ALIGNED_ALLOCATOR();
+
+	btTriangleIndexVertexArray() : m_hasAabb(0)
+	{
+	}
+
+	virtual ~btTriangleIndexVertexArray();
+
+	//just to be backwards compatible
+	btTriangleIndexVertexArray(int numTriangles,int* triangleIndexBase,int triangleIndexStride,int numVertices,btScalar* vertexBase,int vertexStride);
+	
+	void	addIndexedMesh(const btIndexedMesh& mesh, PHY_ScalarType indexType = PHY_INTEGER)
+	{
+		m_indexedMeshes.push_back(mesh);
+		m_indexedMeshes[m_indexedMeshes.size()-1].m_indexType = indexType;
+	}
+	
+	
+	virtual void	getLockedVertexIndexBase(unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0);
+
+	virtual void	getLockedReadOnlyVertexIndexBase(const unsigned char **vertexbase, int& numverts,PHY_ScalarType& type, int& vertexStride,const unsigned char **indexbase,int & indexstride,int& numfaces,PHY_ScalarType& indicestype,int subpart=0) const;
+
+	/// unLockVertexBase finishes the access to a subpart of the triangle mesh
+	/// make a call to unLockVertexBase when the read and write access (using getLockedVertexIndexBase) is finished
+	virtual void	unLockVertexBase(int subpart) {(void)subpart;}
+
+	virtual void	unLockReadOnlyVertexBase(int subpart) const {(void)subpart;}
+
+	/// getNumSubParts returns the number of seperate subparts
+	/// each subpart has a continuous array of vertices and indices
+	virtual int		getNumSubParts() const { 
+		return (int)m_indexedMeshes.size();
+	}
+
+	IndexedMeshArray&	getIndexedMeshArray()
+	{
+		return m_indexedMeshes;
+	}
+
+	const IndexedMeshArray&	getIndexedMeshArray() const
+	{
+		return m_indexedMeshes;
+	}
+
+	virtual void	preallocateVertices(int numverts){(void) numverts;}
+	virtual void	preallocateIndices(int numindices){(void) numindices;}
+
+	virtual bool	hasPremadeAabb() const;
+	virtual void	setPremadeAabb(const btVector3& aabbMin, const btVector3& aabbMax ) const;
+	virtual void	getPremadeAabb(btVector3* aabbMin, btVector3* aabbMax ) const;
+
+}
+;
+
+#endif //BT_TRIANGLE_INDEX_VERTEX_ARRAY_H
diff --git a/opencl/gpu_sat/kernels/bvhTraversal.cl b/opencl/gpu_sat/kernels/bvhTraversal.cl
new file mode 100644
index 000000000..aad0341c4
--- /dev/null
+++ b/opencl/gpu_sat/kernels/bvhTraversal.cl
@@ -0,0 +1,109 @@
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+
+#define SHAPE_CONVEX_HULL 3
+#define SHAPE_CONCAVE_TRIMESH 5
+#define TRIANGLE_NUM_CONVEX_FACES 5
+#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6
+
+typedef unsigned int u32;
+
+///keep this in sync with btCollidable.h
+typedef struct
+{
+	int m_numChildShapes;
+	int blaat2;
+	int m_shapeType;
+	int m_shapeIndex;
+	
+} btCollidableGpu;
+
+typedef struct
+{
+	float4	m_childPosition;
+	float4	m_childOrientation;
+	int m_shapeIndex;
+	int m_unused0;
+	int m_unused1;
+	int m_unused2;
+} btGpuChildShape;
+
+
+typedef struct
+{
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_collidableIdx;
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} BodyData;
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+// work-in-progress
+__kernel void   bvhTraversalKernel( __global const int2* pairs, 
+																					__global const BodyData* rigidBodies, 
+																					__global const btCollidableGpu* collidables,
+																					__global btAabbCL* aabbs,
+																					__global int4* concavePairsOut,
+																					__global volatile int* numConcavePairsOut,
+																					int numPairs,
+																					int maxNumConcavePairsCapacity
+																					)
+{
+
+	int i = get_global_id(0);
+	
+	if (i<numPairs)
+	{
+
+	
+		int bodyIndexA = pairs[i].x;
+		int bodyIndexB = pairs[i].y;
+
+		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;
+		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;
+	
+		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;
+		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;
+		
+		
+		//once the broadphase avoids static-static pairs, we can remove this test
+		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))
+		{
+			return;
+		}
+		
+		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))// && (collidables[collidableIndexB].m_shapeType==SHAPE_CONVEX_HULL))
+		{
+			int pairIdx = atomic_inc(numConcavePairsOut);
+			if (pairIdx<maxNumConcavePairsCapacity)
+			{
+				//int4 newPair;
+				concavePairsOut[pairIdx].x = bodyIndexA;
+				concavePairsOut[pairIdx].y = bodyIndexB;
+				concavePairsOut[pairIdx].z = 5;
+				concavePairsOut[pairIdx].w = 3;
+			}
+		}//SHAPE_CONCAVE_TRIMESH
+		
+	}//i<numpairs
+}
\ No newline at end of file
diff --git a/opencl/gpu_sat/kernels/bvhTraversal.h b/opencl/gpu_sat/kernels/bvhTraversal.h
new file mode 100644
index 000000000..285fa9467
--- /dev/null
+++ b/opencl/gpu_sat/kernels/bvhTraversal.h
@@ -0,0 +1,112 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* bvhTraversalKernelCL= \
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"\n"
+"#define SHAPE_CONVEX_HULL 3\n"
+"#define SHAPE_CONCAVE_TRIMESH 5\n"
+"#define TRIANGLE_NUM_CONVEX_FACES 5\n"
+"#define SHAPE_COMPOUND_OF_CONVEX_HULLS 6\n"
+"\n"
+"typedef unsigned int u32;\n"
+"\n"
+"///keep this in sync with btCollidable.h\n"
+"typedef struct\n"
+"{\n"
+"	int m_numChildShapes;\n"
+"	int blaat2;\n"
+"	int m_shapeType;\n"
+"	int m_shapeIndex;\n"
+"	\n"
+"} btCollidableGpu;\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4	m_childPosition;\n"
+"	float4	m_childOrientation;\n"
+"	int m_shapeIndex;\n"
+"	int m_unused0;\n"
+"	int m_unused1;\n"
+"	int m_unused2;\n"
+"} btGpuChildShape;\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	float4 m_pos;\n"
+"	float4 m_quat;\n"
+"	float4 m_linVel;\n"
+"	float4 m_angVel;\n"
+"\n"
+"	u32 m_collidableIdx;\n"
+"	float m_invMass;\n"
+"	float m_restituitionCoeff;\n"
+"	float m_frictionCoeff;\n"
+"} BodyData;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"\n"
+"// work-in-progress\n"
+"__kernel void   bvhTraversalKernel( __global const int2* pairs, \n"
+"																					__global const BodyData* rigidBodies, \n"
+"																					__global const btCollidableGpu* collidables,\n"
+"																					__global btAabbCL* aabbs,\n"
+"																					__global int4* concavePairsOut,\n"
+"																					__global volatile int* numConcavePairsOut,\n"
+"																					int numPairs,\n"
+"																					int maxNumConcavePairsCapacity\n"
+"																					)\n"
+"{\n"
+"\n"
+"	int i = get_global_id(0);\n"
+"	\n"
+"	if (i<numPairs)\n"
+"	{\n"
+"\n"
+"	\n"
+"		int bodyIndexA = pairs[i].x;\n"
+"		int bodyIndexB = pairs[i].y;\n"
+"\n"
+"		int collidableIndexA = rigidBodies[bodyIndexA].m_collidableIdx;\n"
+"		int collidableIndexB = rigidBodies[bodyIndexB].m_collidableIdx;\n"
+"	\n"
+"		int shapeIndexA = collidables[collidableIndexA].m_shapeIndex;\n"
+"		int shapeIndexB = collidables[collidableIndexB].m_shapeIndex;\n"
+"		\n"
+"		\n"
+"		//once the broadphase avoids static-static pairs, we can remove this test\n"
+"		if ((rigidBodies[bodyIndexA].m_invMass==0) &&(rigidBodies[bodyIndexB].m_invMass==0))\n"
+"		{\n"
+"			return;\n"
+"		}\n"
+"		\n"
+"		if ((collidables[collidableIndexA].m_shapeType==SHAPE_CONCAVE_TRIMESH))// && (collidables[collidableIndexB].m_shapeType==SHAPE_CONVEX_HULL))\n"
+"		{\n"
+"			int pairIdx = atomic_inc(numConcavePairsOut);\n"
+"			if (pairIdx<maxNumConcavePairsCapacity)\n"
+"			{\n"
+"				//int4 newPair;\n"
+"				concavePairsOut[pairIdx].x = bodyIndexA;\n"
+"				concavePairsOut[pairIdx].y = bodyIndexB;\n"
+"				concavePairsOut[pairIdx].z = 5;\n"
+"				concavePairsOut[pairIdx].w = 3;\n"
+"			}\n"
+"		}//SHAPE_CONCAVE_TRIMESH\n"
+"		\n"
+"	}//i<numpairs\n"
+"}\n"
+;
diff --git a/opencl/gpu_sat/kernels/sat.cl b/opencl/gpu_sat/kernels/sat.cl
index 3eb1f8faf..431834421 100644
--- a/opencl/gpu_sat/kernels/sat.cl
+++ b/opencl/gpu_sat/kernels/sat.cl
@@ -1,4 +1,7 @@
-//keep this enum in sync with the CPU version (in AdlCollisionShape.h)
+//keep this enum in sync with the CPU version (in btCollidable.h)
+//written by Erwin Coumans
+
+
 #define SHAPE_CONVEX_HULL 3
 #define SHAPE_CONCAVE_TRIMESH 5
 #define TRIANGLE_NUM_CONVEX_FACES 5
diff --git a/opencl/gpu_sat/kernels/satKernels.h b/opencl/gpu_sat/kernels/satKernels.h
index b40a359b4..eafeeabc4 100644
--- a/opencl/gpu_sat/kernels/satKernels.h
+++ b/opencl/gpu_sat/kernels/satKernels.h
@@ -1,6 +1,9 @@
 //this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
 static const char* satKernelsCL= \
-"//keep this enum in sync with the CPU version (in AdlCollisionShape.h)\n"
+"//keep this enum in sync with the CPU version (in btCollidable.h)\n"
+"//written by Erwin Coumans\n"
+"\n"
+"\n"
 "#define SHAPE_CONVEX_HULL 3\n"
 "#define SHAPE_CONCAVE_TRIMESH 5\n"
 "#define TRIANGLE_NUM_CONVEX_FACES 5\n"