Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/bin/glut32.dll
+++ b/Extras/RigidBodyGpuPipeline/bin/glut32.dll
--- a/Extras/RigidBodyGpuPipeline/bin/glut64.dll
+++ b/Extras/RigidBodyGpuPipeline/bin/glut64.dll
--- a/Extras/RigidBodyGpuPipeline/build/findDirectX11.lua
+++ b/Extras/RigidBodyGpuPipeline/build/findDirectX11.lua
@@ -0,0 +1,36 @@
+--- Detect the DirectX 11 SDK.
+-- Reads the DXSDK_DIR environment variable and tries to open
+-- "Include/D3D11.h" appended directly to it.
+-- @return true when the header could be opened, false otherwise
+function findDirectX11()
+	local dx11path = os.getenv("DXSDK_DIR")
+	if not dx11path then
+		return false
+	end
+
+	local filepath = string.format("%s%s", dx11path, "Include/D3D11.h")
+	-- keep the handle local and close it: the original stored it in the
+	-- global 'headerdx11' and never released it
+	local header = io.open(filepath, "r")
+	if not header then
+		return false
+	end
+	header:close()
+
+	printf("Found DX11: '%s'", filepath)
+	return true
+end
+
+--- Add the DirectX 11 SDK settings to the current premake scope.
+-- All paths are written as the literal text "$(DXSDK_DIR)/..."; this script
+-- does not expand the variable itself.
+-- @return always true
+function initDirectX11()
+	configuration {}
+	defines { "ADL_ENABLE_DX11" }
+	includedirs { "$(DXSDK_DIR)/include" }
+
+	-- the library directory differs per target platform
+	configuration "x32"
+		libdirs { "$(DXSDK_DIR)/Lib/x86" }
+	configuration "x64"
+		libdirs { "$(DXSDK_DIR)/Lib/x64" }
+
+	-- back to the unfiltered scope before adding the libraries
+	configuration {}
+	links {
+		"d3dcompiler",
+		"dxerr",
+		"dxguid",
+		"d3dx9",
+		"d3d9",
+		"winmm",
+		"comctl32",
+		"d3dx11"
+	}
+	return true
+end
--- a/Extras/RigidBodyGpuPipeline/build/findOpenCL.lua
+++ b/Extras/RigidBodyGpuPipeline/build/findOpenCL.lua
@@ -0,0 +1,84 @@
+	-- todo: add Apple OpenCL environment vars
+
+	--- Check for AMD's OpenCL SDK through its environment variable.
+	-- @return true when AMDAPPSDKROOT is set, false otherwise
+	function findOpenCL_AMD()
+		return os.getenv("AMDAPPSDKROOT") ~= nil
+	end
+
+	--- Check for NVIDIA's OpenCL support through the CUDA environment variable.
+	-- @return true when CUDA_PATH is set, false otherwise
+	function findOpenCL_NVIDIA()
+		return os.getenv("CUDA_PATH") ~= nil
+	end
+
+	--- Check for Intel's OpenCL SDK through its environment variable.
+	-- @return true when INTELOCLSDKROOT is set, false otherwise
+	function findOpenCL_Intel()
+		return os.getenv("INTELOCLSDKROOT") ~= nil
+	end
+			
+	--- Configure the current premake scope for AMD's OpenCL SDK.
+	-- The configuration filter is reset first; nothing else happens when
+	-- AMDAPPSDKROOT is not set.
+	-- @return true when the settings were added, false when the variable is missing
+	function initOpenCL_AMD()
+		configuration {}
+		if not os.getenv("AMDAPPSDKROOT") then
+			return false
+		end
+
+		defines { "ADL_ENABLE_CL", "CL_PLATFORM_AMD" }
+		includedirs { "$(AMDAPPSDKROOT)/include" }
+
+		-- per-platform library directory
+		configuration "x32"
+			libdirs { "$(AMDAPPSDKROOT)/lib/x86" }
+		configuration "x64"
+			libdirs { "$(AMDAPPSDKROOT)/lib/x86_64" }
+
+		configuration {}
+		links { "OpenCL" }
+		return true
+	end
+
+
+	--- Configure the current premake scope for NVIDIA's OpenCL (CUDA toolkit).
+	-- The configuration filter is reset first; nothing else happens when
+	-- CUDA_PATH is not set.
+	-- @return true when the settings were added, false when the variable is missing
+	function initOpenCL_NVIDIA()
+		configuration {}
+		if not os.getenv("CUDA_PATH") then
+			return false
+		end
+
+		defines { "ADL_ENABLE_CL", "CL_PLATFORM_NVIDIA" }
+		includedirs { "$(CUDA_PATH)/include" }
+
+		-- per-platform library directory
+		configuration "x32"
+			libdirs { "$(CUDA_PATH)/lib/Win32" }
+		configuration "x64"
+			libdirs { "$(CUDA_PATH)/lib/x64" }
+
+		configuration {}
+		links { "OpenCL" }
+		return true
+	end
+
+	--- Configure the current premake scope for Intel's OpenCL SDK.
+	-- The configuration filter is reset first; nothing else happens when
+	-- INTELOCLSDKROOT is not set.
+	-- @return true when the settings were added, false when the variable is missing
+	function initOpenCL_Intel()
+		configuration {}
+		if not os.getenv("INTELOCLSDKROOT") then
+			return false
+		end
+
+		defines { "ADL_ENABLE_CL", "CL_PLATFORM_INTEL" }
+		includedirs { "$(INTELOCLSDKROOT)/include" }
+
+		-- per-platform library directory
+		configuration "x32"
+			libdirs { "$(INTELOCLSDKROOT)/lib/x86" }
+		configuration "x64"
+			libdirs { "$(INTELOCLSDKROOT)/lib/x64" }
+
+		configuration {}
+		links { "OpenCL" }
+		return true
+	end
+	
--- a/Extras/RigidBodyGpuPipeline/build/findOpenGLGlewGlut.lua
+++ b/Extras/RigidBodyGpuPipeline/build/findOpenGLGlewGlut.lua
@@ -0,0 +1,52 @@
+	-- todo: add Apple OpenCL environment vars
+
+	--- Link the OpenGL libraries that match the target platform.
+	-- Leaves the configuration filter reset on return.
+	function initOpenGL()
+		configuration {}
+
+		configuration "Windows"
+			links { "opengl32" }
+
+		configuration "MacOSX"
+			links { "Carbon.framework", "OpenGL.framework", "AGL.framework" }
+
+		-- every platform that is neither Windows nor MacOSX
+		configuration { "not Windows", "not MacOSX" }
+			links { "GL", "GLU" }
+
+		configuration {}
+	end
+
+	--- Add the GLUT include/library settings for the target platform.
+	-- Reads the global projectRootDir to locate the Glut directory on Windows.
+	-- Leaves the configuration filter reset on return.
+	function initGlut()
+		configuration {}
+
+		configuration "Windows"
+			-- headers and import libraries live in the same directory
+			local glutDir = projectRootDir .. "../../Glut"
+			includedirs { glutDir }
+			libdirs { glutDir }
+		configuration { "Windows", "x32" }
+			links { "glut32" }
+		configuration { "Windows", "x64" }
+			links { "glut64" }
+
+		configuration "MacOSX"
+			links { "Glut.framework" }
+
+		-- every platform that is neither Windows nor MacOSX
+		configuration { "not Windows", "not MacOSX" }
+			links { "glut" }
+
+		configuration {}
+	end
+
+	--- Add the static GLEW settings; only Windows configurations are touched.
+	-- Reads the global projectRootDir to locate the Glut directory.
+	-- Leaves the configuration filter reset on return.
+	function initGlew()
+		configuration {}
+
+		configuration "Windows"
+			defines { "GLEW_STATIC" }
+			-- the GLEW files are looked up in the Glut directory
+			local glewDir = projectRootDir .. "../../Glut"
+			includedirs { glewDir }
+			libdirs { glewDir }
+		configuration { "Windows", "x32" }
+			links { "glew32s" }
+		configuration { "Windows", "x64" }
+			links { "glew64s" }
+
+		configuration {}
+	end
+
+
+
--- a/Extras/RigidBodyGpuPipeline/build/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/build/premake4.lua
@@ -0,0 +1,55 @@
+-- Premake4 solution script: declares the solution-wide build settings, loads
+-- the SDK helper scripts and includes the individual OpenCL projects.
+solution "0MySolution"
+
+	-- Multithreaded compiling
+	if _ACTION == "vs2010" then
+		buildoptions { "/MP" }
+	end
+
+	configurations { "Release", "Debug" }
+	configuration "Release"
+		flags { "Optimize", "StaticRuntime", "NoMinimalRebuild", "FloatFast" }
+	configuration "Debug"
+		flags { "Symbols", "StaticRuntime", "NoMinimalRebuild", "NoEditAndContinue", "FloatFast" }
+
+	platforms { "x32", "x64" }
+
+	-- NOTE(review): presumably the more specific x64 blocks below override
+	-- the plain "x64" suffix - confirm against premake's filter semantics
+	configuration "x64"
+		targetsuffix "_64"
+	configuration { "x64", "debug" }
+		targetsuffix "_x64_debug"
+	configuration { "x64", "release" }
+		targetsuffix "_x64"
+	configuration { "x32", "debug" }
+		targetsuffix "_debug"
+
+	configuration {}
+
+		flags { "NoRTTI", "NoExceptions" }
+		defines { "_HAS_EXCEPTIONS=0" }
+		targetdir "../bin"
+		-- _ACTION is nil when premake is started without an action (for
+		-- example to print the help); fall back to "" so the concatenation
+		-- cannot raise an error
+		location("./" .. (_ACTION or ""))
+
+	-- deliberately global: findOpenGLGlewGlut.lua reads projectRootDir
+	projectRootDir = os.getcwd() .. "/../"
+	print("Project root directory: " .. projectRootDir)
+
+	-- helper functions used by the project scripts included below
+	dofile ("findOpenCL.lua")
+	dofile ("findDirectX11.lua")
+	dofile ("findOpenGLGlewGlut.lua")
+
+	language "C++"
+
+	include "../opencl/gpu_rigidbody_pipeline2"
+	include "../opencl/gpu_rigidbody_pipeline"
+
+	include "../opencl/basic_initialize"
+	include "../opencl/vector_add"
+
+	include "../opencl/primitives/AdlTest"
+	include "../opencl/primitives/benchmark"
+	include "../opencl/3dGridBroadphase"
+	include "../opencl/broadphase_benchmark"
+
+	
--- a/Extras/RigidBodyGpuPipeline/build/vs2008.bat
+++ b/Extras/RigidBodyGpuPipeline/build/vs2008.bat
@@ -0,0 +1,10 @@
+
+rem Generates the Visual Studio 2008 files by running premake4 from ..\..\..\msvc.
+rem NOTE(review): the commented-out invocations below use options that are not
+rem defined in the premake4.lua next to this file - presumably leftovers; confirm.
+rem premake4 --no-pelibs vs2008
+rem premake4 --no-pedemos vs2008
+rem premake4 --no-bulletlibs --no-pelibs vs2008
+rem premake4 --with-nacl vs2008
+
+..\..\..\msvc\premake4 vs2008
+rem create a cache directory inside the vs2008 output folder
+mkdir vs2008\cache
+
+rem wait for a key press before the script ends
+pause
--- a/Extras/RigidBodyGpuPipeline/build/vs2010.bat
+++ b/Extras/RigidBodyGpuPipeline/build/vs2010.bat
@@ -0,0 +1,5 @@
+
+rem Generates the Visual Studio 2010 files by running premake4 from ..\..\..\msvc.
+..\..\..\msvc\premake4 vs2010
+
+rem create a cache directory inside the vs2010 output folder
+mkdir vs2010\cache
+rem wait for a key press before the script ends
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/AMD/premake4.lua
@@ -0,0 +1,45 @@
+-- Project definition for the Bullet 2 basic demo using AMD's OpenCL SDK.
+-- The project is only declared on Windows and only when findOpenCL_AMD()
+-- reports that the SDK is available.
+if os.is("Windows") then
+
+	-- local on purpose: the original assigned an accidental global 'hasCL'
+	local hasCL = findOpenCL_AMD()
+
+	if (hasCL) then
+
+		project "basic_bullet2_demo_AMD"
+
+		initOpenCL_AMD()
+
+		language "C++"
+
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		includedirs {
+			"..",
+			"../../../bullet2",
+			"../../testbed",
+			"../../../rendering/Gwen",
+			"../../../opencl/basic_initialize",
+			"../../../opencl/primitives"
+		}
+
+		links {
+			"testbed",
+			"bullet2",
+			"gwen"
+		}
+
+		initOpenGL()
+		initGlut()
+
+		-- every source in the parent directory plus the shared OpenCL helper
+		files {
+			"../**.cpp",
+			"../**.h",
+			"../../../opencl/basic_initialize/btOpenCLUtils.cpp",
+			"../../../opencl/basic_initialize/btOpenCLUtils.h"
+		}
+
+	end
+
+end
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.cpp
@@ -0,0 +1,538 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BasicDemo.h"
+#include "GlutStuff.h"
+///btBulletDynamicsCommon.h is the main Bullet include file, contains most common include files.
+#include "btBulletDynamicsCommon.h"
+#include "CustomConvexShape.h"
+#include "CustomConvexPairCollision.h"
+#include "CustomCollisionDispatcher.h"
+
+#include "ConvexHeightFieldShape.h"
+#include "GLDebugDrawer.h"
+static GLDebugDrawer sDebugDraw;
+
+#include <stdio.h> //printf debugging
+
+#ifdef CL_PLATFORM_AMD
+#include "../../opencl/basic_initialize/btOpenCLUtils.h"
+
+cl_context			g_cxMainContext=0;
+cl_command_queue	g_cqCommandQue=0;
+cl_device_id		g_clDevice=0;
+#endif
+
+///create 144 (6x6x4) dynamic objects
+#define ARRAY_SIZE_X 6
+#define ARRAY_SIZE_Y 6
+#define ARRAY_SIZE_Z 4
+
+//maximum number of objects (and allow user to shoot additional boxes)
+#define MAX_PROXIES (ARRAY_SIZE_X*ARRAY_SIZE_Y*ARRAY_SIZE_Z + 1024)
+
+///scaling of the objects (0.1 = 20 centimeter boxes )
+#define SCALING 1.
+#define START_POS_X 0
+#define START_POS_Y -0.8
+#define START_POS_Z 0
+
+#define BoxVtxCount 8
+
+static float BoxVtx[] = {
+-0.5,-0.5,-0.5,
+-0.5,-0.5,0.5,
+-0.5,0.5,-0.5,
+-0.5,0.5,0.5,
+0.5,-0.5,-0.5,
+0.5,-0.5,0.5,
+0.5,0.5,-0.5,
+0.5,0.5,0.5,
+};
+
+static float BoxVtx2[] = {
+-20.3,-10.3,-20.3,
+-20.3,-10.3,20.3,
+-20.3,10.3,-20.3,
+-20.3,10.3,20.3,
+20.3,-10.3,-20.3,
+20.3,-10.3,20.3,
+20.3,10.3,-20.3,
+20.3,10.3,20.3,
+};
+
+
+#define BarrelVtxCount2 57
+
+static float BarrelVtx2[] = {
+0.0f,-0.5f,0.0f,				0.0f,-1.0f,0.0f,
+0.282362f,-0.5f,-0.205148f,     0.0f,-1.0f,0.0f,
+0.349018f,-0.5f,0.0f,           0.0f,-1.0f,0.0f,
+0.107853f,-0.5f,-0.331936f,     0.0f,-1.0f,0.0f,
+-0.107853f,-0.5f,-0.331936f,    0.0f,-1.0f,0.0f,
+0.107853f,-0.5f,-0.331936f,     0.0f,-1.0f,0.0f,
+-0.282362f,-0.5f,-0.205148f,    0.0f,-1.0f,0.0f,
+-0.349018f,-0.5f,0.0f,          0.0f,-1.0f,0.0f,
+-0.282362f,-0.5f,0.205148f,     0.0f,-1.0f,0.0f,
+-0.107853f,-0.5f,0.331936f,     0.0f,-1.0f,0.0f,
+0.107853f,-0.5f,0.331936f,      0.0f,-1.0f,0.0f,
+0.282362f,-0.5f,0.205148f,      0.0f,-1.0f,0.0f,
+0.0f,0.5f,0.0f,                 0.0f,1.0f,0.0f,
+0.349018f,0.5f,0.0f,            0.0f,1.0f,0.0f,
+0.282362f,0.5f,-0.205148f,      0.0f,1.0f,0.0f,
+0.107853f,0.5f,-0.331936f,      0.0f,1.0f,0.0f,
+0.107853f,0.5f,-0.331936f,      0.0f,1.0f,0.0f,
+-0.107853f,0.5f,-0.331936f,     0.0f,1.0f,0.0f,
+-0.282362f,0.5f,-0.205148f,     0.0f,1.0f,0.0f,
+-0.349018f,0.5f,0.0f,           0.0f,1.0f,0.0f,
+-0.282362f,0.5f,0.205148f,      0.0f,1.0f,0.0f,
+-0.107853f,0.5f,0.331936f,      0.0f,1.0f,0.0f,
+0.107853f,0.5f,0.331936f,       0.0f,1.0f,0.0f,
+0.282362f,0.5f,0.205148f,       0.0f,1.0f,0.0f,
+0.349018f,-0.5f,0.0f,           0.957307f,-0.289072f,0.0f,
+0.404509f,0.0f,-0.293893f,      0.809017f,0.0f,-0.587785f,
+0.5f,0.0f,0.0f,                 1.0f,0.0f,0.0f,
+0.282362f,-0.5f,-0.205148f,     0.774478f,-0.289072f,-0.562691f,
+0.154508f,0.0f,-0.475528f,      0.309017f,0.0f,-0.951057f,
+0.107853f,-0.5f,-0.331936f,     0.295824f,-0.289072f,-0.910453f,
+0.107853f,-0.5f,-0.331936f,     0.295824f,-0.289072f,-0.910453f,
+-0.154509f,0.0f,-0.475528f,     -0.309017f,0.0f,-0.951057f,
+0.154508f,0.0f,-0.475528f,      0.309017f,0.0f,-0.951057f,
+-0.107853f,-0.5f,-0.331936f,    -0.295824f,-0.289072f,-0.910453f,
+-0.404509f,0.0f,-0.293893f,     -0.809017f,0.0f,-0.587785f,
+-0.282362f,-0.5f,-0.205148f,    -0.774478f,-0.289072f,-0.562691f,
+-0.5f,0.0f,0.0f,                -1.0f,0.0f,0.0f,
+-0.349018f,-0.5f,0.0f,          -0.957307f,-0.289072f,0.0f,
+-0.404508f,0.0f,0.293893f,      -0.809017f,0.0f,0.587785f,
+-0.282362f,-0.5f,0.205148f,     -0.774478f,-0.289072f,0.562691f,
+-0.154509f,0.0f,0.475528f,      -0.309017f,0.0f,0.951056f,
+-0.107853f,-0.5f,0.331936f,     -0.295824f,-0.289072f,0.910453f,
+0.154509f,0.0f,0.475528f,       0.309017f,0.0f,0.951056f,
+0.107853f,-0.5f,0.331936f,      0.295824f,-0.289072f,0.910453f,
+0.404509f,0.0f,0.293892f,       0.809017f,0.0f,0.587785f,
+0.282362f,-0.5f,0.205148f,      0.774478f,-0.289072f,0.562691f,
+0.282362f,0.5f,-0.205148f,      0.774478f,0.289072f,-0.562691f,
+0.349018f,0.5f,0.0f,            0.957307f,0.289072f,0.0f,
+0.107853f,0.5f,-0.331936f,      0.295824f,0.289072f,-0.910453f,
+-0.107853f,0.5f,-0.331936f,     -0.295824f,0.289072f,-0.910453f,
+0.107853f,0.5f,-0.331936f,      0.295824f,0.289072f,-0.910453f,
+-0.282362f,0.5f,-0.205148f,     -0.774478f,0.289072f,-0.562691f,
+-0.349018f,0.5f,0.0f,           -0.957307f,0.289072f,0.0f,
+-0.282362f,0.5f,0.205148f,      -0.774478f,0.289072f,0.562691f,
+-0.107853f,0.5f,0.331936f,      -0.295824f,0.289072f,0.910453f,
+0.107853f,0.5f,0.331936f,       0.295824f,0.289072f,0.910453f,
+0.282362f,0.5f,0.205148f,       0.774478f,0.289072f,0.562691f,
+};
+
+
+static int BarrelIdx[] = {
+0,1,2,
+0,3,1,
+0,4,5,
+0,6,4,
+0,7,6,
+0,8,7,
+0,9,8,
+0,10,9,
+0,11,10,
+0,2,11,
+12,13,14,
+12,14,15,
+12,16,17,
+12,17,18,
+12,18,19,
+12,19,20,
+12,20,21,
+12,21,22,
+12,22,23,
+12,23,13,
+24,25,26,
+24,27,25,
+27,28,25,
+27,29,28,
+30,31,32,
+30,33,31,
+33,34,31,
+33,35,34,
+35,36,34,
+35,37,36,
+37,38,36,
+37,39,38,
+39,40,38,
+39,41,40,
+41,42,40,
+41,43,42,
+43,44,42,
+43,45,44,
+45,26,44,
+45,24,26,
+26,46,47,
+26,25,46,
+25,48,46,
+25,28,48,
+32,49,50,
+32,31,49,
+31,51,49,
+31,34,51,
+34,52,51,
+34,36,52,
+36,53,52,
+36,38,53,
+38,54,53,
+38,40,54,
+40,55,54,
+40,42,55,
+42,56,55,
+42,44,56,
+44,47,56,
+44,26,47,
+};
+
+
+///Submit one OpenGL vertex built from the x, y and z components of v; v.w is not read.
+__inline void glVertexFloat4( const float4& v )
+{
+	glVertex3f( v.x, v.y, v.z );
+}
+
+///Draw nVtx points from vtx as yellow GL_POINTS of size 3, placed by the given rotation and translation.
+///The current matrix is saved with glPushMatrix() before drawing and restored with glPopMatrix() afterwards.
+__inline void drawPointListTransformed(const float4* vtx,  int nVtx, const float4& translation, const Quaternion& quat)
+{
+	glPushMatrix();
+
+	Matrix3x3 basis = mtTranspose( qtGetRotationMatrix( quat ) );
+
+	//fill the 16 floats handed to glMultMatrixf: three rotation rows, then the translation
+	float modelMatrix[16];
+	for (int row = 0; row < 3; ++row)
+	{
+		modelMatrix[4*row + 0] = basis.m_row[row].x;
+		modelMatrix[4*row + 1] = basis.m_row[row].y;
+		modelMatrix[4*row + 2] = basis.m_row[row].z;
+		modelMatrix[4*row + 3] = 0;
+	}
+	modelMatrix[12] = translation.x;
+	modelMatrix[13] = translation.y;
+	modelMatrix[14] = translation.z;
+	modelMatrix[15] = 1;
+
+	glMultMatrixf( modelMatrix );
+
+	float4 pointColor = make_float4(1,1,0,0);
+
+	glPointSize(3.f);
+	glBegin(GL_POINTS);
+	for (int idx = 0; idx < nVtx; ++idx)
+	{
+		//the colour is set per point, exactly as many times as there are points
+		glColor4f(pointColor.x, pointColor.y, pointColor.z, 1);
+		glVertexFloat4( vtx[idx] );
+	}
+	glEnd();
+
+	glPopMatrix();
+}
+///Draw the given sample points; forwards all arguments unchanged to drawPointListTransformed().
+void displaySamples(const float4* vertices, int numVertices, const float4& translation, const Quaternion& quaternion) 
+{
+	drawPointListTransformed( vertices,numVertices, translation, quaternion );
+}
+
+
+
+///Draw the height-field sample points of every CUSTOM_POLYHEDRAL_SHAPE_TYPE object in the world.
+///Nothing is drawn unless the debug drawer has btIDebugDraw::DBG_DrawContactPoints enabled.
+void BasicDemo::renderSurfacePoints()
+{
+	//displayCallback() treats m_dynamicsWorld as optional, so do the same here
+	if (!m_dynamicsWorld || !m_dynamicsWorld->getDebugDrawer())
+		return;
+
+	if (!(m_dynamicsWorld->getDebugDrawer()->getDebugMode() & btIDebugDraw::DBG_DrawContactPoints))
+		return;
+
+	for (int i=0;i<m_dynamicsWorld->getCollisionObjectArray().size();i++)
+	{
+		btCollisionObject* ob = m_dynamicsWorld->getCollisionObjectArray()[i];
+		if (ob->getCollisionShape()->getShapeType() != CUSTOM_POLYHEDRAL_SHAPE_TYPE)
+			continue;
+
+		CustomConvexShape* customConvex = (CustomConvexShape*)ob->getCollisionShape();
+		ConvexHeightField* cvxShape= customConvex->m_ConvexHeightField;
+		if (!cvxShape)
+		{
+			//the original logged this and then dereferenced the null pointer below
+			printf("aargh\n");
+			continue;
+		}
+
+		//copy the world transform of the object into the float4/Quaternion types used by displaySamples
+		const btVector3& pA = ob->getWorldTransform().getOrigin();
+		btQuaternion qA = ob->getWorldTransform().getRotation();
+
+		float4 bodyApos;
+		bodyApos.x = pA.getX();
+		bodyApos.y = pA.getY();
+		bodyApos.z = pA.getZ();
+		bodyApos.w = 0.f;
+
+		Quaternion bodyAquat;
+		bodyAquat.x = qA.getX();
+		bodyAquat.y = qA.getY();
+		bodyAquat.z = qA.getZ();
+		bodyAquat.w = qA.getW();
+
+		displaySamples(cvxShape->getSamplePoints(),cvxShape->getNumSamplePoints(),bodyApos,bodyAquat);
+	}
+}
+///Step the simulation by the time reported by getDeltaTimeMicroseconds(), then render and present the frame.
+void BasicDemo::clientMoveAndDisplay()
+{
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+	//simple dynamics world doesn't handle fixed-time-stepping
+	//the value is in microseconds; it is queried even when there is no world to step
+	float elapsedMicroseconds = getDeltaTimeMicroseconds();
+
+	if (m_dynamicsWorld != 0)
+	{
+		///step the simulation (stepSimulation is given seconds)
+		float elapsedSeconds = elapsedMicroseconds / 1000000.f;
+		m_dynamicsWorld->stepSimulation(elapsedSeconds);
+
+		//optional but useful: debug drawing
+		m_dynamicsWorld->debugDrawWorld();
+	}
+
+	renderme();
+	renderSurfacePoints();
+
+	glFlush();
+	swapBuffers();
+}
+
+
+
+///Redraw the scene without stepping the simulation.
+void BasicDemo::displayCallback(void)
+{
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+	renderme();
+	renderSurfacePoints();
+
+	//optional but useful: debug drawing to detect problems
+	if (m_dynamicsWorld != 0)
+	{
+		m_dynamicsWorld->debugDrawWorld();
+	}
+
+	glFlush();
+	swapBuffers();
+}
+
+
+
+
+
+///Build the demo scene: collision configuration, custom dispatcher, broadphase, solver and dynamics world,
+///then a static ground shape and an ARRAY_SIZE_X x ARRAY_SIZE_Y x ARRAY_SIZE_Z grid of dynamic bodies.
+void	BasicDemo::initPhysics()
+{
+	setTexturing(true);
+	setShadows(true);
+
+	m_acceleratedRigidBodies = 0;
+
+	setCameraDistance(btScalar(SCALING*20.));
+
+	///collision configuration contains default setup for memory, collision setup
+	m_collisionConfiguration = new btDefaultCollisionConfiguration();
+	//m_collisionConfiguration->setConvexConvexMultipointIterations();
+
+	///use the custom collision dispatcher; the CL_PLATFORM_AMD build also passes the OpenCL context, device and queue.
+	///The original allocated a plain btCollisionDispatcher first and overwrote the pointer right away, leaking it.
+#ifdef CL_PLATFORM_AMD
+	m_dispatcher = new	CustomCollisionDispatcher(m_collisionConfiguration,	g_cxMainContext,g_clDevice,g_cqCommandQue);
+#else
+	m_dispatcher = new	CustomCollisionDispatcher(m_collisionConfiguration);
+#endif
+
+	//NOTE(review): this CreateFunc is never deleted in the visible code - confirm who owns it
+	m_dispatcher->registerCollisionCreateFunc(CUSTOM_POLYHEDRAL_SHAPE_TYPE,CUSTOM_POLYHEDRAL_SHAPE_TYPE,new CustomConvexConvexPairCollision::CreateFunc(m_collisionConfiguration->getSimplexSolver(), m_collisionConfiguration->getPenetrationDepthSolver()));
+
+	m_broadphase = new btDbvtBroadphase();
+
+	///the default constraint solver. For parallel processing you can use a different solver (see Extras/BulletMultiThreaded)
+	btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver;
+	m_solver = sol;
+
+	m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration);
+
+	m_dynamicsWorld->setGravity(btVector3(0,-10,0));
+
+	m_dynamicsWorld->setDebugDrawer(&sDebugDraw);
+
+	///create a few basic rigid bodies
+	//btCollisionShape* groundShape = new btBoxShape(btVector3(btScalar(50.),btScalar(50.),btScalar(50.)));
+#if 1
+	CustomConvexShape* groundShape = new CustomConvexShape(BoxVtx2,BoxVtxCount,3*sizeof(float));
+	//btCollisionShape* groundShape = new btStaticPlaneShape(btVector3(0,1,0),0);
+
+	m_collisionShapes.push_back(groundShape);
+
+	btTransform groundTransform;
+	groundTransform.setIdentity();
+	groundTransform.setOrigin(btVector3(0,-11,0));
+
+	//We can also use DemoApplication::localCreateRigidBody, but for clarity it is provided here:
+	{
+		btScalar mass(0.);
+
+		//rigidbody is dynamic if and only if mass is non zero, otherwise static
+		bool isDynamic = (mass != 0.f);
+
+		btVector3 localInertia(0,0,0);
+		if (isDynamic)
+			groundShape->calculateLocalInertia(mass,localInertia);
+
+		//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+		btDefaultMotionState* myMotionState = new btDefaultMotionState(groundTransform);
+		btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,groundShape,localInertia);
+		btRigidBody* body = new btRigidBody(rbInfo);
+
+		//add the body to the dynamics world
+		m_dynamicsWorld->addRigidBody(body);
+	}
+#endif
+
+
+	{
+		//create a few dynamic rigidbodies
+		// Re-using the same collision is better for memory usage and performance
+
+		//btCollisionShape* colShape = new btBoxShape(btVector3(SCALING*1,SCALING*1,SCALING*1));
+		//btCollisionShape* colShape = new btSphereShape(btScalar(1.));
+#define USE_CUSTOM_HEIGHTFIELD_SHAPE 
+#ifdef USE_CUSTOM_HEIGHTFIELD_SHAPE
+		CustomConvexShape* colShape = new CustomConvexShape(BarrelVtx2,BarrelVtxCount2,6*sizeof(float));
+
+		//CustomConvexShape* colShape = new CustomConvexShape(BoxVtx,BoxVtxCount,3*sizeof(float));
+#else
+		btConvexHullShape* colShape = new btConvexHullShape(BarrelVtx2,BarrelVtxCount2,6*sizeof(float));
+		colShape->setLocalScaling(btVector3(0.9,0.9,0.9));
+
+#endif //USE_CUSTOM_HEIGHTFIELD_SHAPE
+		btScalar scale = 0.5f;
+
+		//btScalar scale = 1.f;
+
+		//next line is already called inside the CustomConvexShape constructor
+		//colShape->initializePolyhedralFeatures();
+
+		m_collisionShapes.push_back(colShape);
+
+		/// Create Dynamic Objects
+		btTransform startTransform;
+		startTransform.setIdentity();
+
+		btScalar	mass(1.f);
+
+		//rigidbody is dynamic if and only if mass is non zero, otherwise static
+		bool isDynamic = (mass != 0.f);
+
+		btVector3 localInertia(0,0,0);
+		if (isDynamic)
+			colShape->calculateLocalInertia(mass,localInertia);
+
+		//ARRAY_SIZE_*/2 is an integer division of the grid dimensions
+		float start_x = START_POS_X - ARRAY_SIZE_X/2;
+		float start_y = START_POS_Y;
+		float start_z = START_POS_Z - ARRAY_SIZE_Z/2;
+
+		for (int k=0;k<ARRAY_SIZE_Y;k++)
+		{
+			for(int j = 0;j<ARRAY_SIZE_Z;j++)	
+			{
+				for (int i=0;i<ARRAY_SIZE_X;i++)
+				{
+					//	if ((k>0) && ((j<2) || (j>(ARRAY_SIZE_Z-3))))
+					//		continue;
+					//	if ((k>0) && ((i<2) || (i>(ARRAY_SIZE_X-3))))
+					//		continue;
+
+					startTransform.setOrigin(SCALING*btVector3(
+										btScalar(scale*2.0*i + start_x),
+										btScalar(scale*1+scale*2.0*k + start_y),
+										btScalar(scale*2.0*j + start_z)));
+
+					//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+					btDefaultMotionState* myMotionState = new btDefaultMotionState(startTransform);
+					btRigidBody* body=0;
+
+					//disabled toggle: turns the k==0 layer into static (zero mass) bodies
+					if (0)//k==0)
+					{
+						btVector3 zeroInertia(0,0,0);
+						btRigidBody::btRigidBodyConstructionInfo rbInfo(0.f,myMotionState,colShape,zeroInertia);
+						body = new btRigidBody(rbInfo);
+					} else
+					{
+						btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,colShape,localInertia);
+						body = new btRigidBody(rbInfo);
+					}
+
+					//m_acceleratedRigidBodies is used as a mapping to the accelerated rigid body index
+					body->setCompanionId(m_acceleratedRigidBodies++);
+					m_dynamicsWorld->addRigidBody(body);
+				}
+			}
+		}
+	}
+
+
+}
+///Reset the demo by destroying the current physics setup and building it again from scratch.
+void	BasicDemo::clientResetScene()
+{
+	exitPhysics();
+	initPhysics();
+}
+	
+
+///Destroy everything initPhysics() created, in the reverse order of creation.
+///Every owned pointer is reset to 0 afterwards, so a repeated call (for example from the destructor) is harmless.
+void	BasicDemo::exitPhysics()
+{
+
+	//cleanup in the reverse order of creation/initialization
+
+	//remove the rigidbodies from the dynamics world and delete them
+	if (m_dynamicsWorld)
+	{
+		int i;
+		for (i=m_dynamicsWorld->getNumCollisionObjects()-1; i>=0 ;i--)
+		{
+			btCollisionObject* obj = m_dynamicsWorld->getCollisionObjectArray()[i];
+			btRigidBody* body = btRigidBody::upcast(obj);
+			if (body && body->getMotionState())
+			{
+				delete body->getMotionState();
+			}
+			m_dynamicsWorld->removeCollisionObject( obj );
+			delete obj;
+		}
+	}
+
+	//delete collision shapes
+	for (int j=0;j<m_collisionShapes.size();j++)
+	{
+		btCollisionShape* shape = m_collisionShapes[j];
+		delete shape;
+	}
+	m_collisionShapes.clear();
+
+	//deleting a null pointer is a no-op, so no guards are needed below
+	delete m_dynamicsWorld;
+	m_dynamicsWorld = 0;
+
+	delete m_solver;
+	m_solver = 0;
+
+	delete m_broadphase;
+	m_broadphase = 0;
+
+	delete m_dispatcher;
+	m_dispatcher = 0;
+
+	delete m_collisionConfiguration;
+	m_collisionConfiguration = 0;
+
+}
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.h
@@ -0,0 +1,86 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef BASIC_DEMO_H
+#define BASIC_DEMO_H
+
+#ifdef _WINDOWS
+#include "Win32DemoApplication.h"
+#define PlatformDemoApplication Win32DemoApplication
+#else
+#include "GlutDemoApplication.h"
+#define PlatformDemoApplication GlutDemoApplication
+#endif
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+class btBroadphaseInterface;
+class btCollisionShape;
+class btOverlappingPairCache;
+class btCollisionDispatcher;
+class btConstraintSolver;
+struct btCollisionAlgorithmCreateFunc;
+class btDefaultCollisionConfiguration;
+
+///BasicDemo is good starting point for learning the code base and porting.
+
+///Demo application that owns the Bullet objects created in initPhysics() and releases them in exitPhysics().
+class BasicDemo : public PlatformDemoApplication
+{
+
+	//keep the collision shapes, for deletion/cleanup
+	btAlignedObjectArray<btCollisionShape*>	m_collisionShapes;
+
+	btBroadphaseInterface*	m_broadphase;
+
+	btCollisionDispatcher*	m_dispatcher;
+
+	btConstraintSolver*	m_solver;
+
+	btDefaultCollisionConfiguration* m_collisionConfiguration;
+
+	//number of bodies that received a companion id in initPhysics()
+	int m_acceleratedRigidBodies;
+
+	public:
+
+	///Start with null pointers: the destructor runs exitPhysics(), which deletes these members,
+	///so they must not hold indeterminate values when initPhysics() was never called.
+	BasicDemo()
+		:m_broadphase(0),
+		m_dispatcher(0),
+		m_solver(0),
+		m_collisionConfiguration(0),
+		m_acceleratedRigidBodies(0)
+	{
+	}
+	virtual ~BasicDemo()
+	{
+		exitPhysics();
+	}
+
+	///Create the dynamics world and populate the scene.
+	void	initPhysics();
+
+	///Delete the bodies, shapes and the dynamics world with its helper objects.
+	void	exitPhysics();
+
+	virtual void clientMoveAndDisplay();
+
+	virtual void displayCallback();
+	virtual void	clientResetScene();
+
+	///Factory: allocates a BasicDemo, then calls myinit() and initPhysics() on it. The caller receives the new object.
+	static DemoApplication* Create()
+	{
+		BasicDemo* demo = new BasicDemo;
+		demo->myinit();
+		demo->initPhysics();
+		return demo;
+	}
+
+	///Draw the sample points of the custom convex shapes (see the implementation for the debug-mode condition).
+	void renderSurfacePoints();
+
+
+};
+
+#endif //BASIC_DEMO_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.cpp
@@ -0,0 +1,507 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#include "ConvexHeightFieldShape.h"
+#include "Stubs/AdlCollideUtils.h"
+#include "CubeMapUtils.h"
+//#include <common/Physics/ShapeBase.h>
+//#include <common/Physics/SphereShape.h>
+//#include "GlutStuff.h"
+
+//#define USE_OLD
+
+//	Constructs the height field from an indexed triangle mesh.
+//	All of the work is delegated to create(); see there for the meaning of the arguments.
+ConvexHeightField::ConvexHeightField(const float4* vtxBuffer, const int4* idxBuffer, int nTriangles)
+: CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD )
+{
+	create( vtxBuffer, idxBuffer, nTriangles );
+}
+
+//	Builds the cube-map height field from an indexed triangle mesh.
+//	  vtxBuffer  : vertex positions, addressed through idxBuffer
+//	  idxBuffer  : one int4 per triangle; only the first three components are read
+//	  nTriangles : number of entries of idxBuffer to process
+//	Steps:
+//	  1. m_scale = distance from the origin to the farthest referenced vertex
+//	  2. for every texel of the 6 cube faces, cast a ray from the origin and store the
+//	     quantized hit fraction, the triangle normal and a barycentric-derived w value
+//	  3. derive m_samplePoints and the per-texel support heights
+//	  4. compute the overall AABB (from the mesh) and the per-face AABBs (from the samples)
+void ConvexHeightField::create( const float4* vtxBuffer, const int4* idxBuffer, int nTriangles )
+{
+	{
+		float maxDx2 = -1.f;
+		int maxIdx = -1;
+		for(int i=0; i<nTriangles; i++)
+		{
+			const int4& idx = idxBuffer[i];
+			for(int j=0; j<3; j++)
+			{
+				float dx2 = dot3F4( vtxBuffer[idx.s[j]], vtxBuffer[idx.s[j]] );
+				if( dx2 > maxDx2 )
+				{
+					maxDx2 = dx2;
+					maxIdx = idx.s[j];
+				}
+			}
+		}
+		//	NOTE(review): with nTriangles == 0 maxDx2 stays at -1 and sqrtf() below is taken of a
+		//	negative number; the assert is the only guard -- confirm callers never pass an empty mesh
+		ADLASSERT( maxIdx != -1 );
+		m_scale = sqrtf( maxDx2 );
+	}
+
+	//	cast a ray through every texel to find its intersection with the mesh
+	{
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				//	ray end point: texel-centre direction scaled to the radius m_scale
+				float4 v;
+				float x = (i+0.5f)/(float)HEIGHT_RES;
+				float y = (j+0.5f)/(float)HEIGHT_RES;
+				v = CubeMapUtils::calcVector(faceIdx, x, y);
+				v = normalize3( v );
+				v *= m_scale;
+
+				float minFraction = FLT_MAX;
+				//	defined fallbacks for a ray that hits no triangle (previously read uninitialized):
+				//	the ray direction as normal and zero barycentric coordinates
+				float4 minNormal = v;
+				float4 minBCrd = make_float4(0.f);
+				for(int itri=0; itri<nTriangles; itri++)
+				{
+					float4 from = make_float4(0.f);
+					float4 bCrd;
+					float fraction = CollideUtils::castRay( vtxBuffer[idxBuffer[itri].x], vtxBuffer[idxBuffer[itri].y], vtxBuffer[idxBuffer[itri].z], 
+						from, v, 0.0f, &bCrd );
+
+					//	Record normal and barycentric coordinates of the NEAREST hit only.
+					//	Previously they were overwritten by every hit, so they could belong to a
+					//	farther triangle than the stored fraction. '<=' keeps the old "last one wins"
+					//	result when several triangles report the same fraction.
+					if( fraction > 0.f && fraction <= minFraction )
+					{
+						minFraction = fraction;
+						float4 ab = vtxBuffer[idxBuffer[itri].y]-vtxBuffer[idxBuffer[itri].x];
+						float4 ac = vtxBuffer[idxBuffer[itri].z]-vtxBuffer[idxBuffer[itri].x];
+						minNormal = cross3( ab, ac );
+						minBCrd = bCrd;
+					}
+				}
+
+				//	no hit: store a height of zero
+				if( minFraction == FLT_MAX )
+					minFraction = 0.f;
+
+				{
+					u8 quantizedHeight = (u8)(minFraction*255.f);
+					sample( (Face)faceIdx, i,j ) = quantizedHeight;
+					sampleNormal( (Face)faceIdx, i,j ) = normalize3(minNormal);
+					//	w: squared length of the barycentric coordinates remapped so that 1/3 -> 0 and 1 -> 1
+					float minValue = 3.f*(1.f/3.f)*(1.f/3.f);
+					sampleNormal( (Face)faceIdx, i,j ).w = (dot3F4( minBCrd, minBCrd ) - minValue )/(1.f-minValue);
+				}
+			}
+		}
+	}
+
+	calcSamplePoints( m_samplePoints );
+
+	//	calc support height using m_samplePoints
+	{
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++) for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+		{
+			float4 v;
+			float x = (i+0.5f)/(float)HEIGHT_RES;
+			float y = (j+0.5f)/(float)HEIGHT_RES;
+			v = CubeMapUtils::calcVector(faceIdx, x, y);
+			v = normalize3( v );
+
+			//	largest projection of any sample point onto this texel direction, in units of m_scale
+			float maxHeight = -1;
+			for(int ie=0; ie<6*HEIGHT_RES*HEIGHT_RES; ie++)
+			{
+				float h = dot3F4( v, m_samplePoints[ie] )/m_scale;
+				ADLASSERT( h <= 1.f );
+				if( h > maxHeight ) maxHeight = h;
+			}
+
+			{
+				//	+1 biases the quantized value upwards; the result is capped at 255
+				u8 quantizedHeight = min2((u8)(maxHeight*255.f)+1, 255);
+				sampleSupport( (Face)faceIdx, i, j ) = quantizedHeight;
+			}
+		}
+	}
+
+	//	overall bounds: all triangle vertices, grown by the collision margin
+	m_aabb.setEmpty();
+	for(int i=0; i<nTriangles; i++)
+	{
+		const int4& idx = idxBuffer[i];
+		m_aabb.includePoint( vtxBuffer[idx.x] );
+		m_aabb.includePoint( vtxBuffer[idx.y] );
+		m_aabb.includePoint( vtxBuffer[idx.z] );
+	}
+	m_aabb.expandBy( make_float4( m_collisionMargin ) );
+
+	//	per-face bounds: the sample points of that face, grown by the collision margin
+	for(int i=0; i<6; i++)
+	{
+		m_faceAabbs[i].setEmpty();
+		for(int j=0; j<HEIGHT_RES*HEIGHT_RES; j++)
+		{
+			float4 p = m_samplePoints[i*HEIGHT_RES*HEIGHT_RES + j];
+			m_faceAabbs[i].includePoint(p);
+		}
+		m_faceAabbs[i].expandBy( make_float4( m_collisionMargin ) );
+	}
+}
+
+//	Returns the line parameter t at which orig + t*vec meets the plane described by planeEqn
+//	(normal in xyz, offset in w), assuming dot3F4 is the xyz dot product.
+//	There is no guard for a direction parallel to the plane: the denominator is then zero.
+static __inline float localIntersectPlaneLine( const float4& planeEqn, const float4& vec, const float4& orig )
+{
+	const float numerator = -planeEqn.w - dot3F4(planeEqn, orig);
+	const float denominator = dot3F4(planeEqn, vec);
+	return numerator/denominator;
+}
+
+
+//	Constructs the height field from a set of plane equations.
+//	  eqn  : array of planes (normal in xyz, offset in w)
+//	  nEqn : number of planes
+//	Steps:
+//	  1. for every texel direction, find the nearest plane hit in front of the origin and store
+//	     that plane in m_normal with w replaced by the hit distance
+//	  2. m_scale = largest hit distance; heights are quantized relative to it
+//	  3. derive m_samplePoints and the per-texel support heights
+//	  4. compute the per-face AABBs from the samples and m_aabb as their union
+ConvexHeightField::ConvexHeightField(const float4* eqn, int nEqn)
+	: CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD )
+{
+	{	//	cast ray to find intersections with the planes
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				//	unit direction through the centre of texel (i,j)
+				float4 v;
+				float x = (i+0.5f)/(float)HEIGHT_RES;
+				float y = (j+0.5f)/(float)HEIGHT_RES;
+				v = CubeMapUtils::calcVector(faceIdx, x, y);
+				v = normalize3( v );
+
+				float minFraction = FLT_MAX;
+				float4 minNormal;
+				for(int ii=0; ii<nEqn; ii++)
+				{
+					const float4& iEqn = eqn[ii];
+
+					float fraction = localIntersectPlaneLine( iEqn, v, make_float4(0.f) );
+
+					//	only hits in front of the origin count; keep the nearest one
+					if( fraction > 0.f )
+					{
+						if( fraction < minFraction )
+						{
+							minFraction = fraction;
+							minNormal = iEqn;
+						}
+					}
+				}
+
+				//	every direction is expected to hit at least one plane; otherwise minNormal is unset
+				ADLASSERT( minFraction != FLT_MAX );
+
+				//	w temporarily carries the hit distance; it is read back below to derive m_scale
+				minNormal.w = minFraction;
+				sampleNormal( (Face)faceIdx, i, j ) = minNormal;
+			}
+		}
+	}
+
+	{
+		//	m_scale = largest hit distance over all texels
+		m_scale = -FLT_MAX;
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				float4& n = sampleNormal( (Face)faceIdx, i, j );
+
+				m_scale = max2( m_scale, n.w );
+			}
+		}
+		
+		//	quantize each hit distance to 0..255 relative to m_scale
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				float4& n = sampleNormal( (Face)faceIdx, i, j );
+				u8 quantizedHeight = (u8)(n.w/m_scale*255.f);
+				sample( (Face)faceIdx, i, j ) = quantizedHeight;
+			}
+		}
+	}
+
+	calcSamplePoints( m_samplePoints );
+
+	//	calc support height using m_samplePoints
+	{
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++) for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+		{
+			float4 v;
+			float x = (i+0.5f)/(float)HEIGHT_RES;
+			float y = (j+0.5f)/(float)HEIGHT_RES;
+			v = CubeMapUtils::calcVector(faceIdx, x, y);
+			v = normalize3( v );
+
+			//	largest projection of any sample point onto this direction, in units of m_scale
+			float maxHeight = -1;
+			for(int ie=0; ie<6*HEIGHT_RES*HEIGHT_RES; ie++)
+			{
+				float h = dot3F4( v, m_samplePoints[ie] )/m_scale;
+				//	clamp instead of asserting (the mesh-based create() asserts h <= 1 here)
+				if (h>1.f)
+					h=1.f;
+//				ADLASSERT( h <= 1.f );
+				if( h > maxHeight ) maxHeight = h;
+			}
+
+			{
+				//	+1 biases the quantized value upwards; the result is capped at 255
+				u8 quantizedHeight = min2((u8)(maxHeight*255.f)+1, 255);
+				sampleSupport( (Face)faceIdx, i, j ) = quantizedHeight;
+			}
+		}
+	}
+
+	//	per-face bounds: the sample points of that face, grown by the collision margin
+	for(int i=0; i<6; i++)
+	{
+		m_faceAabbs[i].setEmpty();
+		for(int j=0; j<HEIGHT_RES*HEIGHT_RES; j++)
+		{
+			float4 p = m_samplePoints[i*HEIGHT_RES*HEIGHT_RES + j];
+			m_faceAabbs[i].includePoint(p);
+		}
+		m_faceAabbs[i].expandBy( make_float4( m_collisionMargin ) );
+	}
+
+	//	overall bounds: union of the six (already expanded) face boxes
+	m_aabb.setEmpty();
+	for(int i=0; i<6; i++)
+	{
+		m_aabb.includeVolume( m_faceAabbs[i] );
+	}
+}
+
+//	Disabled: construction from a generic ShapeBase (sphere special case, otherwise via create()).
+//	The required headers (ShapeBase.h, SphereShape.h) are commented out at the top of this file.
+//	NOTE(review): ConvexHeightFieldShape.h still declares ConvexHeightField(const ShapeBase*),
+//	so any use of that constructor will fail to link while this block is compiled out.
+#if 0
+ConvexHeightField::ConvexHeightField(const ShapeBase* shape)
+	: CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD )
+{
+	if( shape->m_type == ADL_SHAPE_SPHERE )
+	{
+		SphereShape* sphere = (SphereShape*)shape;
+
+		//	a sphere has full height in every direction and a radial normal
+		m_scale = sphere->m_radius;
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				float4 minNormal;
+				float x = (i+0.5f)/(float)HEIGHT_RES;
+				float y = (j+0.5f)/(float)HEIGHT_RES;
+				minNormal = CubeMapUtils::calcVector(faceIdx, x, y);
+				minNormal = normalize3( minNormal );
+				{
+					u8 quantizedHeight = (u8)(1.f*255.f);
+					sample( (Face)faceIdx, i,j ) = quantizedHeight;
+					sampleNormal( (Face)faceIdx, i,j ) = normalize3(minNormal);
+//					float minValue = 3.f*(1.f/3.f)*(1.f/3.f);
+//					sampleNormal( (Face)faceIdx, i,j ).w = (dot3F4( minBCrd, minBCrd ) - minValue )/(1.f-minValue);
+				}
+			}
+		}
+
+		calcSamplePoints( m_samplePoints );
+
+		m_aabb.m_max = make_float4( sphere->m_radius );
+		m_aabb.m_min = make_float4( -sphere->m_radius );
+
+		m_aabb.expandBy( make_float4( m_collisionMargin ) );
+
+		for(int i=0; i<6; i++)
+		{
+			m_faceAabbs[i].setEmpty();
+			for(int j=0; j<HEIGHT_RES*HEIGHT_RES; j++)
+			{
+				float4 p = m_samplePoints[i*HEIGHT_RES*HEIGHT_RES + j];
+				m_faceAabbs[i].includePoint(p);
+			}
+			m_faceAabbs[i].expandBy( make_float4( m_collisionMargin ) );
+		}
+	}
+	else
+	{
+		//	any other shape: build from its triangle mesh
+		ShapeBase* s = (ShapeBase*)shape;
+
+		create( s->getVertexBuffer(), s->getTriangleBuffer(), s->getNumTris() );
+	}
+}
+#endif
+
+//	Nothing to release: all tables of this class are fixed-size member arrays.
+ConvexHeightField::~ConvexHeightField()
+{
+
+}
+
+//	Returns length3(p) minus the bilinearly interpolated surface height in the direction of p.
+//	Points at or beyond the radius m_scale return FLT_MAX.
+//	p is expressed in the local frame of the shape.
+float ConvexHeightField::queryDistance(const float4& p ) const
+{
+	if( dot3F4( p, p ) >= m_scale*m_scale ) return FLT_MAX;
+
+	//	cube-map face and texel-space coordinates (texel centres sit at integer values)
+	int faceIdx;
+	float x, y;
+	CubeMapUtils::calcCrd( p, faceIdx, x, y );
+	x = (x*HEIGHT_RES) - 0.5f;
+	y = (y*HEIGHT_RES) - 0.5f;
+
+	//	NOTE(review): the cast truncates toward zero, so for x or y in (-0.5,0) the base texel is 0
+	//	and dx/dy are negative (extrapolation at the low border) -- confirm this is intended
+	const int xi = (int)(x);
+	const int yi = (int)(y);
+	const float dx = x-xi;
+	const float dy = y-yi;
+
+	//	neighbour texels, clamped to the last row/column of the face
+	const int xip = min2((int)(HEIGHT_RES-1), xi+1);
+	const int yip = min2((int)(HEIGHT_RES-1), yi+1);
+
+	const u8 xy = sample( (Face)faceIdx, xi, yi );
+	const u8 xpy = sample( (Face)faceIdx, xip, yi );
+	const u8 xpyp = sample( (Face)faceIdx, xip, yip );
+	const u8 xyp = sample( (Face)faceIdx, xi, yip );
+
+	//	bilinear blend of the four quantized heights, then back to world units
+	float height = (xy*(1.f-dx)+xpy*dx)*(1.f-dy) + (xyp*(1.f-dx)+xpyp*dx)*dy;
+	height = height/255.f*m_scale;
+
+	height = length3( p ) - height;
+	return height;
+}
+
+//	Returns the support height stored for the direction of p: the maximum of the four
+//	support samples surrounding the texel position, converted back to world units.
+//	Unlike queryDistance() there is no radius test, so any non-degenerate p is accepted.
+float ConvexHeightField::querySupportHeight(const float4& p ) const
+{
+//	if( dot3F4( p, p ) >= m_scale*m_scale ) return FLT_MAX;
+
+	//	cube-map face and texel-space coordinates (texel centres sit at integer values)
+	int faceIdx;
+	float x, y;
+	CubeMapUtils::calcCrd( p, faceIdx, x, y );
+	x = (x*HEIGHT_RES) - 0.5f;
+	y = (y*HEIGHT_RES) - 0.5f;
+
+	const int xi = (int)(x);
+	const int yi = (int)(y);
+
+	//	neighbour texels, clamped to the last row/column of the face
+	const int xip = min2((int)(HEIGHT_RES-1), xi+1);
+	const int yip = min2((int)(HEIGHT_RES-1), yi+1);
+
+	const u8 xy = sampleSupport( (Face)faceIdx, xi, yi );
+	const u8 xpy = sampleSupport( (Face)faceIdx, xip, yi );
+	const u8 xpyp = sampleSupport( (Face)faceIdx, xip, yip );
+	const u8 xyp = sampleSupport( (Face)faceIdx, xi, yip );
+
+	//	conservative choice: take the largest of the four samples instead of interpolating
+	float height = max2( xy, max2( xpy, max2( xpyp, xyp ) ) );
+	height = height/255.f*m_scale;
+
+	return height;
+}
+
+//	Returns the w component of the normal stored for the texel under the direction of p.
+//	Points at or beyond the radius m_scale return 0.
+float ConvexHeightField::queryW(const float4& p ) const
+{
+	if( dot3F4( p, p ) >= m_scale*m_scale ) return 0;
+
+	//	cube-map face and texel-space coordinates (texel centres sit at integer values)
+	int faceIdx;
+	float x, y;
+	CubeMapUtils::calcCrd( p, faceIdx, x, y );
+	x = (x*HEIGHT_RES) - 0.5f;
+	y = (y*HEIGHT_RES) - 0.5f;
+
+	//	single-texel lookup: no interpolation
+	const int xi = (int)(x);
+	const int yi = (int)(y);
+
+	return sampleNormal( (Face)faceIdx, xi, yi ).w;
+}
+
+//	Writes the stored normal of the texel under the direction of p to normalOut.
+//	No distance is computed and the function always returns true.
+bool ConvexHeightField::queryDistanceWithNormal( const float4& p, float4& normalOut ) const
+{
+	int face;
+	float u, v;
+	CubeMapUtils::calcCrd( p, face, u, v );
+
+	//	to texel space; the cast below truncates toward zero
+	u = (u*HEIGHT_RES) - 0.5f;
+	v = (v*HEIGHT_RES) - 0.5f;
+	const int texelX = (int)(u);
+	const int texelY = (int)(v);
+
+	normalOut = sampleNormal( (Face)face, texelX, texelY );
+	return true;
+}
+
+//	Fills points[] with one surface point per texel: the unit texel direction scaled by the
+//	de-quantized height. points must have room for 6*HEIGHT_RES*HEIGHT_RES entries, laid out
+//	exactly like m_data (face-major, then x + y*HEIGHT_RES).
+void ConvexHeightField::calcSamplePoints(float4* points) const
+{
+	const float invRes = 1.f;	//	placeholder to keep the divisions below explicit
+	(void)invRes;
+	for(int face=0; face<6; face++)
+	{
+		const int faceBase = HEIGHT_RES*HEIGHT_RES*face;
+		for(int ix=0; ix<HEIGHT_RES; ix++)
+		{
+			for(int iy=0; iy<HEIGHT_RES; iy++)
+			{
+				//	unit direction through the texel centre
+				const float u = (ix+0.5f)/(float)HEIGHT_RES;
+				const float w = (iy+0.5f)/(float)HEIGHT_RES;
+				float4 dir = CubeMapUtils::calcVector(face, u, w);
+				dir = normalize3( dir );
+
+				//	0..255 maps linearly to 0..m_scale
+				const int stored = sample( (Face)face, ix, iy );
+				const float radius = stored/255.f*m_scale;
+
+				points[ faceBase + ix + iy*HEIGHT_RES ] = radius*dir;
+			}
+		}
+	}
+}
+
+//	Computes a single surface point from a flat sample index sIdx
+//	(face = sIdx / (HEIGHT_RES*HEIGHT_RES), remainder = position within the face).
+//	NOTE(review): this does not follow the same conventions as calcSamplePoints():
+//	  - the remainder is split as i = pIdx/HEIGHT_RES, j = pIdx%HEIGHT_RES, whereas sample()
+//	    addresses its table as x + y*HEIGHT_RES;
+//	  - the direction is built from viewVector/xVector/yVector, whose signs flip on the
+//	    negative faces, instead of CubeMapUtils::calcVector().
+//	The result therefore need not equal m_samplePoints[sIdx] -- confirm against callers before relying on it.
+float4 ConvexHeightField::calcSamplePoint( int sIdx ) const
+{
+	int idir; int plus;
+	Face faceIdx = (Face)(sIdx/(HEIGHT_RES*HEIGHT_RES));
+	idir = (faceIdx/2);		//	0,1,2 = x,y,z axis
+	plus = faceIdx & 1;		//	odd face index = positive side
+
+	//	face axis, and the two in-plane axes obtained by rotating its components
+	float4 viewVector = make_float4((idir==0)?1.f:0.f, (idir==1)?1.f:0.f, (idir==2)?1.f:0.f );
+	if( plus==0 ) viewVector *= -1.f;
+	float4 xVector = make_float4( viewVector.z, viewVector.x, viewVector.y );
+	float4 yVector = make_float4( viewVector.y, viewVector.z, viewVector.x );
+	//	corner of the face from which the texel grid is walked
+	float4 orig = viewVector-xVector-yVector;
+
+	int pIdx = sIdx%(HEIGHT_RES*HEIGHT_RES);
+	int i = pIdx/HEIGHT_RES;
+	int j = pIdx%HEIGHT_RES;
+
+	//	texel centre on the cube face, then projected onto the unit sphere
+	float4 v = orig + (i+0.5f)*xVector/(HEIGHT_RES*0.5f) + (j+0.5f)*yVector/(HEIGHT_RES*0.5f);
+	v = normalize3( v );
+
+	//	0..255 maps linearly to 0..m_scale
+	int quantizedHeight = sample( faceIdx, i, j );
+	float rheight = quantizedHeight/255.f*m_scale;
+	return rheight*v;
+}
+
+//	Returns the cached sample-point array (getNumSamplePoints() entries).
+//	The pointer refers to a member array and is valid for the lifetime of this object.
+const float4* ConvexHeightField::getSamplePoints() const
+{
+	return m_samplePoints;
+}
+
+//	Number of sample points: HEIGHT_RES x HEIGHT_RES texels on each of the 6 cube faces.
+int ConvexHeightField::getNumSamplePoints() const
+{
+	return HEIGHT_RES*HEIGHT_RES*6;
+}
+
+//	Maps a scalar s to a colour: each channel is an inverted parabola 1-(4*(s-c))^2
+//	centred at c = 0.75 (first), 0.5 (second) and 0.25 (third component).
+//	Values are not clamped, so channels go negative away from their centre.
+__inline
+float4 rainbowMap( float s )
+{
+	const float gain = 4.f;
+	const float first = gain*(s-0.75f);
+	const float second = gain*(s-0.5f);
+	const float third = gain*(s-0.25f);
+
+	return make_float4( 1.f-first*first, 1.f-second*second, 1.f-third*third );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.h
@@ -0,0 +1,143 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifndef CONVEX_HEIGHT_FIELD_SHAPE_H
+#define CONVEX_HEIGHT_FIELD_SHAPE_H
+
+#include "Stubs/AdlQuaternion.h"
+#include "Stubs/AdlCollisionShape.h"
+#include "Stubs/AdlAabb.h"
+
+class ShapeBase;
+
+//	Convex shape approximated by a cube map of radial heights around the local origin.
+//	Each of the 6 cube faces holds HEIGHT_RES x HEIGHT_RES texels; every texel stores a
+//	quantized height, a normal and a quantized support height.
+class ConvexHeightField : public CollisionShape
+{
+	public:
+		enum
+		{
+			//	texels along each edge of a cube face
+			HEIGHT_RES = 4, //was 4 originally
+		};
+		//	Face order matches the face index produced by CubeMapUtils::calcCrd():
+		//	axis*2 for the negative side, axis*2+1 for the positive side.
+		enum Face
+		{
+			FACE_XM,
+			FACE_XP,
+			FACE_YM,
+			FACE_YP,
+			FACE_ZM,
+			FACE_ZP,
+			NUM_FACES,
+		};
+
+		//	build from an indexed triangle mesh (see create())
+		ConvexHeightField(const float4* vtxBuffer, const int4* idxBuffer, int nTriangles);
+		//	NOTE(review): the definition of this overload is inside '#if 0' in ConvexHeightFieldShape.cpp
+		ConvexHeightField(const ShapeBase* shape);
+		//	build from nEqn plane equations (normal in xyz, offset in w)
+		ConvexHeightField(const float4* eqn, int nEqn);
+
+		//	only sets the shape type; m_scale and all tables are left unset
+		ConvexHeightField(): CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD ){}
+
+		virtual ~ConvexHeightField();
+
+		//	CollisionShape interface
+		virtual float queryDistance(const float4& p ) const;
+		//	distance is not written to normalOut.w
+		virtual bool queryDistanceWithNormal( const float4& p, float4& normalOut ) const;
+
+		//	largest of the four support samples around the direction of p, in world units
+		float querySupportHeight(const float4& p ) const;
+
+		//	w component of the stored normal for the texel under p (0 at or beyond radius m_scale)
+		float queryW(const float4& p ) const;
+
+		//	others
+		//	raw table access; (x,y) address a texel of the given face
+		u8& sample(Face face, int x, int y);
+		u8 sample(Face face, int x, int y) const;
+
+		u8& sampleSupport(Face face, int x, int y);
+		u8 sampleSupport(Face face, int x, int y) const;
+
+		float4& sampleNormal(Face face, int x, int y);
+		float4 sampleNormal(Face face, int x, int y) const;
+
+		//	fill points[] (6*HEIGHT_RES*HEIGHT_RES entries) with the surface point of every texel
+		void calcSamplePoints(float4* points) const;
+		float4 calcSamplePoint(int sIdx) const;
+		const float4* getSamplePoints() const;
+		
+		int getNumSamplePoints() const;
+
+		//void displaySamples(const float4& translation, const Quaternion& quaternion) const;
+
+	private:
+		void create( const float4* vtxBuffer, const int4* idxBuffer, int nTriangles );
+
+	public:
+		//	quantized radial height per texel: 0..255 maps to 0..m_scale
+		u8 m_data[HEIGHT_RES*HEIGHT_RES*6];
+		//	per-texel normal; the meaning of w depends on which constructor filled the table
+		float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+		//	radius that the quantized value 255 corresponds to
+		float m_scale;
+
+		//	quantized support height per texel direction, same scale as m_data
+		u8 m_supportHeight[HEIGHT_RES*HEIGHT_RES*6];
+
+		//	surface point of every texel, as produced by calcSamplePoints()
+		float4 m_samplePoints[HEIGHT_RES*HEIGHT_RES*6];
+		//	bounding box of the sample points of each face, grown by the collision margin
+		Aabb m_faceAabbs[6];
+};
+
+//	Mutable access to the quantized height of texel (x,y) on the given face.
+//	Both coordinates must lie in [0, HEIGHT_RES); the lower bound is now asserted as well,
+//	since a negative coordinate would address memory before the intended texel.
+__inline
+u8& ConvexHeightField::sample(Face face, int x, int y)
+{
+	ADLASSERT( x >= 0 );
+	ADLASSERT( y >= 0 );
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+	return m_data[ HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES ];
+}
+
+//	Read-only access to the quantized height of texel (x,y) on the given face.
+//	Both coordinates must lie in [0, HEIGHT_RES); the lower bound is now asserted as well,
+//	since a negative coordinate would address memory before the intended texel.
+__inline
+u8 ConvexHeightField::sample(Face face, int x, int y) const
+{
+	ADLASSERT( x >= 0 );
+	ADLASSERT( y >= 0 );
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+	return m_data[ HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES ];
+}
+
+//	Mutable access to the quantized support height of texel (x,y) on the given face.
+//	Both coordinates must lie in [0, HEIGHT_RES); the lower bound is now asserted as well,
+//	since a negative coordinate would address memory before the intended texel.
+__inline
+u8& ConvexHeightField::sampleSupport(Face face, int x, int y)
+{
+	ADLASSERT( x >= 0 );
+	ADLASSERT( y >= 0 );
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+	return m_supportHeight[ HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES ];
+}
+
+//	Read-only access to the quantized support height of texel (x,y) on the given face.
+//	Both coordinates must lie in [0, HEIGHT_RES); the lower bound is now asserted as well,
+//	since a negative coordinate would address memory before the intended texel.
+__inline
+u8 ConvexHeightField::sampleSupport(Face face, int x, int y) const
+{
+	ADLASSERT( x >= 0 );
+	ADLASSERT( y >= 0 );
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+	return m_supportHeight[ HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES ];
+}
+
+//	Mutable access to the stored normal of texel (x,y) on the given face.
+//	Both coordinates must lie in [0, HEIGHT_RES); the lower bound is now asserted as well,
+//	since a negative coordinate would address memory before the intended texel.
+__inline
+float4& ConvexHeightField::sampleNormal(Face face, int x, int y)
+{
+	ADLASSERT( x >= 0 );
+	ADLASSERT( y >= 0 );
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+	return m_normal[ HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES ];
+}
+
+//	Read-only access (by value) to the stored normal of texel (x,y) on the given face.
+//	Both coordinates must lie in [0, HEIGHT_RES); the lower bound is now asserted as well,
+//	since a negative coordinate would address memory before the intended texel.
+__inline
+float4 ConvexHeightField::sampleNormal(Face face, int x, int y) const
+{
+	ADLASSERT( x >= 0 );
+	ADLASSERT( y >= 0 );
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+	return m_normal[ HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES ];
+}
+
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CubeMapUtils.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CubeMapUtils.h
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+//	Coords are 0.5f shifted. See CubeMapDemo.cpp for usage. 
+//	Stateless helpers converting between 3D directions and cube-map coordinates.
+//	A face index is axis*2 for the negative side of an axis and axis*2+1 for the positive side
+//	(axis 0,1,2 = x,y,z); in-face coordinates x,y run over [0,1].
+class CubeMapUtils
+{
+	public:
+		//enum Face
+		//{
+		//	FACE_XM,
+		//	FACE_XP,
+		//	FACE_YM,
+		//	FACE_YP,
+		//	FACE_ZM,
+		//	FACE_ZP,
+		//	NUM_FACES,
+		//};
+
+		//	direction p -> face index and in-face coordinates
+		__inline
+		static void calcCrd(const float4& p, int& faceIdxOut, float& x, float& y);
+
+		//	face index and in-face coordinates -> (unnormalized) point on the cube surface
+		__inline
+		static float4 calcVector(int faceIdx, float x, float y);
+};
+
+
+//	Projects the direction p onto the cube map.
+//	  faceIdxOut : axis*2 (+1 when p is not on the negative side of that axis), where axis is
+//	               the component of p with the largest square
+//	  x, y       : the two remaining components divided by the magnitude of the dominant one,
+//	               remapped from [-1,1] to [0,1]
+//	A zero dominant component (p == 0) yields x = y = 0.5.
+__inline
+void CubeMapUtils::calcCrd(const float4& p, int& faceIdxOut, float& x, float& y)
+{
+	const float4 unitAxes[] = {make_float4(1,0,0,0), make_float4(0,1,0,0), make_float4(0,0,1,0)};
+
+	//	dominant axis; on ties the lower axis index is kept
+	const float squares[] = {p.x*p.x, p.y*p.y, p.z*p.z};
+	int axis = 0;
+	if( squares[1] > squares[0] ) axis = 1;
+	if( squares[2] > squares[axis] ) axis = 2;
+
+	const float4 dominantAxis = unitAxes[axis];
+	const bool onNegativeSide = dot3F4( p, dominantAxis ) < 0.f;
+	faceIdxOut = axis*2 + (onNegativeSide ? 0 : 1);
+
+	const float4 magnitude = make_float4( fabs(p.x), fabs(p.y), fabs(p.z), 0.f );
+
+	//	pick the two in-face components and the depth along the dominant axis
+	float depth;
+	switch( axis )
+	{
+	case 0:
+		x = p.y;
+		y = p.z;
+		depth = magnitude.x;
+		break;
+	case 1:
+		x = p.z;
+		y = p.x;
+		depth = magnitude.y;
+		break;
+	default:
+		x = p.x;
+		y = p.y;
+		depth = magnitude.z;
+		break;
+	}
+
+	//	perspective divide (guarded against a zero depth), then [-1,1] -> [0,1]
+	const float invDepth = (depth==0.f)? 0.f: (1.f/depth);
+	x = (x*invDepth+1.f)*0.5f;
+	y = (y*invDepth+1.f)*0.5f;
+}
+
+//	Inverse of calcCrd(): returns the point on the cube surface for in-face coordinates
+//	x,y in [0,1] on the given face. Even face indices place the fixed component at -1,
+//	odd ones at +1. The result is not normalized.
+__inline
+float4 CubeMapUtils::calcVector(int faceIdx, float x, float y)
+{
+	const float fixedComponent = (faceIdx%2 == 0)? -1.f:1.f;
+
+	//	[0,1] -> [-1,1]
+	const float u = x*2.f-1.f;
+	const float v = y*2.f-1.f;
+
+	switch( faceIdx/2 )
+	{
+	case 0:
+		return make_float4(fixedComponent, u, v);
+	case 1:
+		return make_float4(v, fixedComponent, u);
+	default:
+		return make_float4(u, v, fixedComponent);
+	}
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.cpp
@@ -0,0 +1,699 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "CustomCollisionDispatcher.h"
+#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "CustomConvexShape.h"
+#include "CustomConvexPairCollision.h"
+#include "LinearMath/btQuickprof.h"
+
+
+
+#ifdef CL_PLATFORM_AMD
+
+#include "Adl/Adl.h"
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlContact4.h"
+#include "Stubs/AdlQuaternion.h"
+#include "Stubs/ChNarrowPhase.h"
+
+#include "Stubs/Solver.h"
+
+
+//	Private state of the OpenCL path of CustomCollisionDispatcher.
+//	Allocated and filled by the constructor, released by the destructor.
+struct	CustomDispatchData
+{
+	adl::DeviceCL* m_ddcl;			//	wraps the caller-supplied CL device/context/queue
+	adl::Device* m_deviceHost;		//	host device used for the CPU-side buffers
+	ShapeDataType m_ShapeBuffer;	//	narrow-phase shape storage (MAX_CONVEX_SHAPES_CL entries)
+	
+	//	broadphase pairs as body-index pairs, one per Bullet overlapping pair
+	adl::HostBuffer<int2>* m_pBufPairsCPU;
+	adl::Buffer<int2>* m_pBufPairsGPU;
+	//	contacts produced by the narrow phase
+	adl::Buffer<Contact4>* m_pBufContactOutGPU;
+	adl::HostBuffer<Contact4>* m_pBufContactOutCPU;
+	adl::ChNarrowphase<adl::TYPE_CL>::Data* m_Data;
+
+	//	rigid-body state indexed by the companion id of the collision object
+	adl::HostBuffer<RigidBodyBase::Body>* m_pBufRBodiesCPU;
+	adl::Buffer<RigidBodyBase::Body>* m_pBufRBodiesGPU;
+
+	//	inertia data per body; the CPU copy is an adl::Buffer created on the host device
+	adl::Buffer<RigidBodyBase::Shape>*	m_bodyInfoBufferCPU;
+	adl::Buffer<RigidBodyBase::Shape>*	m_bodyInfoBufferGPU;
+
+	adl::Solver<adl::TYPE_CL>::Data* m_solverDataGPU;
+	SolverData		m_contactCGPU;	//	contact constraints
+	void*			m_frictionCGPU;	//	friction constraints
+
+	//	number of shapes uploaded to m_ShapeBuffer so far; also the next free shape slot
+	int m_numAcceleratedShapes;
+};
+#endif //CL_PLATFORM_AMD
+
+//	Creates the dispatcher. With CL_PLATFORM_AMD defined and both a CL context and a command
+//	queue supplied, all host/GPU buffers of the accelerated path are allocated here; otherwise
+//	m_internalData stays null and the dispatcher falls back to the stock btCollisionDispatcher path.
+CustomCollisionDispatcher::CustomCollisionDispatcher(btCollisionConfiguration* collisionConfiguration
+#ifdef CL_PLATFORM_AMD
+		, cl_context context,cl_device_id device,cl_command_queue queue
+#endif //CL_PLATFORM_AMD
+):btCollisionDispatcher(collisionConfiguration),
+m_internalData(0)
+{
+#ifdef CL_PLATFORM_AMD
+
+	if (context && queue)
+	{
+		m_internalData = new CustomDispatchData();
+		//	NOTE(review): zeroing the struct with memset assumes ShapeDataType and SolverData are plain data -- confirm
+		memset(m_internalData,0,sizeof(CustomDispatchData));
+
+		//	wrap the caller-owned CL handles in an adl device
+		adl::DeviceUtils::Config cfg;
+		m_internalData->m_ddcl = new adl::DeviceCL();
+		m_internalData->m_ddcl->m_deviceIdx = device;
+		m_internalData->m_ddcl->m_context = context;
+		m_internalData->m_ddcl->m_commandQueue = queue;
+		m_internalData->m_ddcl->m_kernelManager = new adl::KernelManager;
+
+
+		//	host-side staging buffers
+		m_internalData->m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );
+		m_internalData->m_pBufPairsCPU = new adl::HostBuffer<int2>(m_internalData->m_deviceHost, MAX_BROADPHASE_COLLISION_CL);
+		m_internalData->m_pBufContactOutCPU = new adl::HostBuffer<Contact4>(m_internalData->m_deviceHost, MAX_BROADPHASE_COLLISION_CL);
+		m_internalData->m_pBufRBodiesCPU = new adl::HostBuffer<RigidBodyBase::Body>(m_internalData->m_deviceHost, MAX_CONVEX_BODIES_CL);
+		
+		//	device-side buffers, narrow phase and solver state
+		m_internalData->m_bodyInfoBufferCPU = new adl::Buffer<RigidBodyBase::Shape>(m_internalData->m_deviceHost,MAX_CONVEX_BODIES_CL);
+		m_internalData->m_pBufContactOutGPU = new adl::Buffer<Contact4>(m_internalData->m_ddcl, MAX_BROADPHASE_COLLISION_CL);
+		m_internalData->m_bodyInfoBufferGPU = new adl::Buffer<RigidBodyBase::Shape>(m_internalData->m_ddcl,MAX_CONVEX_BODIES_CL);
+		m_internalData->m_pBufPairsGPU = new adl::Buffer<int2>(m_internalData->m_ddcl, MAX_BROADPHASE_COLLISION_CL);
+		m_internalData->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate( m_internalData->m_ddcl, MAX_BROADPHASE_COLLISION_CL);
+		m_internalData->m_pBufRBodiesGPU = new adl::Buffer<RigidBodyBase::Body>(m_internalData->m_ddcl, MAX_CONVEX_BODIES_CL);
+		m_internalData->m_Data = adl::ChNarrowphase<adl::TYPE_CL>::allocate(m_internalData->m_ddcl);
+		m_internalData->m_ShapeBuffer = adl::ChNarrowphase<adl::TYPE_CL>::allocateShapeBuffer(m_internalData->m_ddcl, MAX_CONVEX_SHAPES_CL);	
+		m_internalData->m_numAcceleratedShapes = 0;
+
+		m_internalData->m_contactCGPU = adl::Solver<adl::TYPE_CL>::allocateConstraint4( m_internalData->m_ddcl, MAX_BROADPHASE_COLLISION_CL);
+		m_internalData->m_frictionCGPU = adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( m_internalData->m_ddcl, MAX_BROADPHASE_COLLISION_CL);
+
+	}
+
+
+
+#endif //CL_PLATFORM_AMD
+}
+
+//	Releases the resources of the accelerated path (if it was set up by the constructor).
+//	All buffers are deleted before the devices they were created on.
+CustomCollisionDispatcher::~CustomCollisionDispatcher(void)
+{
+#ifdef CL_PLATFORM_AMD
+	if (m_internalData)
+	{
+		delete m_internalData->m_pBufPairsCPU;
+		delete m_internalData->m_pBufPairsGPU;
+		delete m_internalData->m_pBufContactOutGPU;
+		delete m_internalData->m_pBufContactOutCPU;
+
+		//	these four buffers are allocated with new in the constructor and were previously leaked
+		delete m_internalData->m_pBufRBodiesCPU;
+		delete m_internalData->m_pBufRBodiesGPU;
+		delete m_internalData->m_bodyInfoBufferCPU;
+		delete m_internalData->m_bodyInfoBufferGPU;
+
+		adl::Solver<adl::TYPE_CL>::deallocateConstraint4( m_internalData->m_contactCGPU );
+		adl::Solver<adl::TYPE_CL>::deallocateFrictionConstraint( m_internalData->m_frictionCGPU );
+
+
+		adl::Solver<adl::TYPE_CL>::deallocate(m_internalData->m_solverDataGPU);
+
+		//	NOTE(review): m_Data, m_ShapeBuffer and m_ddcl->m_kernelManager are allocated in the
+		//	constructor but no matching release is visible here -- check the ChNarrowphase API and
+		//	the adl::DeviceCL destructor before adding one
+		adl::DeviceUtils::deallocate(m_internalData->m_deviceHost);
+		delete m_internalData->m_ddcl;		
+		delete m_internalData;
+	}
+	
+#endif //CL_PLATFORM_AMD
+
+}
+
+
+#ifdef CL_PLATFORM_AMD
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+//	Builds the inertia record for a Bullet collision object.
+//	Static/kinematic objects and objects that are not btRigidBody get zero inverse inertia.
+//	Dynamic bodies get the local diagonal inverse inertia in m_initInvInertia and the same
+//	tensor rotated by the current orientation (R * I^-1 * R^T) in m_invInertia.
+//	Only these two fields of the returned Shape are written.
+RigidBodyBase::Shape CreateBodyInfo(const btCollisionObject& colObj)
+{
+	RigidBodyBase::Shape info;
+	info.m_initInvInertia = mtZero();
+
+	const btRigidBody* rigidBody = btRigidBody::upcast(&colObj);
+	const bool hasNoDynamics = colObj.isStaticOrKinematicObject() || !rigidBody;
+	if( hasNoDynamics )
+	{
+		info.m_invInertia = mtZero();
+		return info;
+	}
+
+	//	local inverse inertia goes on the diagonal
+	btVector3 invInertiaDiag = rigidBody->getInvInertiaDiagLocal();
+	info.m_initInvInertia.m_row[0].x = invInertiaDiag.x();
+	info.m_initInvInertia.m_row[1].y = invInertiaDiag.y();
+	info.m_initInvInertia.m_row[2].z = invInertiaDiag.z();
+
+	//	copy the Bullet orientation into an Adl quaternion
+	btQuaternion bulletRot = colObj.getWorldTransform().getRotation();
+	Quaternion orientation;
+	orientation.x = bulletRot.getX();
+	orientation.y = bulletRot.getY();
+	orientation.z = bulletRot.getZ();
+	orientation.w = bulletRot.getW();
+
+	//	rotate the local tensor into the world frame
+	Matrix3x3 rot = qtGetRotationMatrix( orientation);
+	Matrix3x3 rotT = mtTranspose( rot );
+	info.m_invInertia = mtMul( mtMul( rot, info.m_initInvInertia ), rotT );
+
+	return info;
+}
+
+//	Converts a Bullet collision object into the body record used by the OpenCL pipeline.
+//	  colObj   : source object; position, orientation, restitution and friction are always copied
+//	  shapeIdx : index of the accelerated shape assigned to this body
+//	Static/kinematic objects and objects that are not btRigidBody get zero velocities and
+//	zero inverse mass; dynamic bodies get their current velocities and inverse mass.
+RigidBodyBase::Body CreateRBodyCL(const btCollisionObject& colObj, int shapeIdx)
+{
+	RigidBodyBase::Body out;
+
+	//	world-space position (w cleared)
+	const btVector3& origin = colObj.getWorldTransform().getOrigin();
+	out.m_pos.x = origin.getX();
+	out.m_pos.y = origin.getY();
+	out.m_pos.z = origin.getZ();
+	out.m_pos.w = 0.0f;
+
+	//	world-space orientation
+	btQuaternion rot = colObj.getWorldTransform().getRotation();
+	out.m_quat.x = rot.getX();
+	out.m_quat.y = rot.getY();
+	out.m_quat.z = rot.getZ();
+	out.m_quat.w = rot.getW();
+
+	const btRigidBody* rigidBody = btRigidBody::upcast(&colObj);
+	const bool isDynamic = !( colObj.isStaticOrKinematicObject() || !rigidBody );
+	if( isDynamic )
+	{
+		const btVector3& linear = rigidBody->getLinearVelocity();
+		const btVector3& angular = rigidBody->getAngularVelocity();
+
+		out.m_linVel = make_float4(linear.x(),linear.y(),linear.z(),0.0f);
+		out.m_angVel = make_float4(angular.x(),angular.y(),angular.z(),0.0f);
+		out.m_invMass = rigidBody->getInvMass();
+	}
+	else
+	{
+		//	immovable: no velocity, infinite mass
+		out.m_linVel = make_float4(0.0f, 0.0f, 0.0f);
+		out.m_angVel = make_float4(0.0f, 0.0f, 0.0f);
+		out.m_invMass = 0.f;
+	}
+
+	out.m_shapeIdx = shapeIdx; 
+
+	//	material coefficients come straight from the collision object
+	out.m_restituitionCoeff = colObj.getRestitution();
+	out.m_frictionCoeff = colObj.getFriction();
+
+	return out;
+}
+#endif //CL_PLATFORM_AMD
+
+void CustomCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) 
+{
+	BT_PROFILE("CustomCollisionDispatcher::dispatchAllCollisionPairs");
+	{
+	btBroadphasePairArray& overlappingPairArray = pairCache->getOverlappingPairArray();
+	bool bGPU = (m_internalData != 0);
+#ifdef CL_PLATFORM_AMD
+	if ( !bGPU )
+#endif //CL_PLATFORM_AMD
+	{
+		BT_PROFILE("btCollisionDispatcher::dispatchAllCollisionPairs");
+		btCollisionDispatcher::dispatchAllCollisionPairs(pairCache,dispatchInfo,dispatcher);
+	}
+#ifdef CL_PLATFORM_AMD
+
+	else
+	{
+		{
+			BT_PROFILE("refreshContactPoints");
+			//----------------------------------------------------------------
+			// GPU version of convex heightmap narrowphase collision detection
+			//----------------------------------------------------------------
+			for ( int i = 0; i < getNumManifolds(); i++ )
+			{
+				btPersistentManifold* manifold = getManifoldByIndexInternal(i);
+
+
+				btCollisionObject* body0 = (btCollisionObject*)manifold->getBody0();
+				btCollisionObject* body1 = (btCollisionObject*)manifold->getBody1();
+
+				manifold->refreshContactPoints(body0->getWorldTransform(),body1->getWorldTransform());
+			}
+		}
+
+		// OpenCL 
+		int nColPairsFromBP = overlappingPairArray.size();
+		btAssert(MAX_BROADPHASE_COLLISION_CL >= nColPairsFromBP);
+
+		int maxBodyIndex = -1;
+
+		{
+			BT_PROFILE("CreateRBodyCL and GPU pairs");
+			for ( int i=0; i<overlappingPairArray.size(); i++)
+			{
+				btAssert(i<MAX_BROADPHASE_COLLISION_CL);
+
+				btBroadphasePair* pair = &overlappingPairArray[i];
+
+				btCollisionObject* colObj0 = (btCollisionObject*)pair->m_pProxy0->m_clientObject;
+				btCollisionObject* colObj1 = (btCollisionObject*)pair->m_pProxy1->m_clientObject;
+
+				int bodyIndex0 = colObj0->getCompanionId();
+				int bodyIndex1 = colObj1->getCompanionId();
+
+				//keep a one-to-one mapping between Bullet and Adl broadphase pairs
+				(*m_internalData->m_pBufPairsCPU)[i].x = bodyIndex0;
+				(*m_internalData->m_pBufPairsCPU)[i].y = bodyIndex1;
+
+				if (bodyIndex0>=0 && bodyIndex1>=0)
+				{
+					//create companion shapes (if necessary)
+
+					btAssert(colObj0->getCollisionShape()->getShapeType() == CUSTOM_POLYHEDRAL_SHAPE_TYPE);
+					btAssert(colObj1->getCollisionShape()->getShapeType() == CUSTOM_POLYHEDRAL_SHAPE_TYPE);
+
+					CustomConvexShape* convexShape0 = (CustomConvexShape*)colObj0->getCollisionShape();
+					CustomConvexShape* convexShape1 = (CustomConvexShape*)colObj1->getCollisionShape();
+
+					if (convexShape0->m_acceleratedCompanionShapeIndex<0)
+					{
+						convexShape0->m_acceleratedCompanionShapeIndex = m_internalData->m_numAcceleratedShapes;
+						adl::ChNarrowphase<adl::TYPE_CL>::setShape(m_internalData->m_ShapeBuffer, convexShape0->m_ConvexHeightField, convexShape0->m_acceleratedCompanionShapeIndex, 0.0f);
+						m_internalData->m_numAcceleratedShapes++;
+					}
+					if (convexShape1->m_acceleratedCompanionShapeIndex<0)
+					{
+						convexShape1->m_acceleratedCompanionShapeIndex = m_internalData->m_numAcceleratedShapes;
+						adl::ChNarrowphase<adl::TYPE_CL>::setShape(m_internalData->m_ShapeBuffer, convexShape1->m_ConvexHeightField, convexShape1->m_acceleratedCompanionShapeIndex, 0.0f);
+						m_internalData->m_numAcceleratedShapes++;
+					}
+
+					btAssert(m_internalData->m_numAcceleratedShapes<MAX_CONVEX_SHAPES_CL);
+
+					if (bodyIndex0>maxBodyIndex)
+						maxBodyIndex = bodyIndex0;
+					if (bodyIndex1>maxBodyIndex)
+						maxBodyIndex = bodyIndex1;
+
+					btAssert(maxBodyIndex<MAX_CONVEX_BODIES_CL);
+					if (maxBodyIndex>=MAX_CONVEX_BODIES_CL)
+					{
+						printf("error: maxBodyIndex(%d)>MAX_CONVEX_BODIES_CL(%d)\n",maxBodyIndex,MAX_CONVEX_BODIES_CL);
+					}
+
+					(*m_internalData->m_pBufRBodiesCPU)[bodyIndex0] = CreateRBodyCL(*colObj0, convexShape0->m_acceleratedCompanionShapeIndex);
+					m_internalData->m_bodyInfoBufferCPU->m_ptr[bodyIndex0] = CreateBodyInfo(*colObj0);
+					(*m_internalData->m_pBufRBodiesCPU)[bodyIndex1] = CreateRBodyCL(*colObj1, convexShape0->m_acceleratedCompanionShapeIndex);
+					m_internalData->m_bodyInfoBufferCPU->m_ptr[bodyIndex1] = CreateBodyInfo(*colObj1);
+				} else
+				{
+					//TODO: dispatch using default dispatcher
+					btAssert(0);
+				}
+			}
+		}
+
+
+		if (maxBodyIndex>=0)
+		{
+			
+			int numOfConvexRBodies = maxBodyIndex+1;
+
+			
+
+			adl::ChNarrowphaseBase::Config cfgNP;
+			cfgNP.m_collisionMargin = 0.01f;
+			int nContactOut = 0;
+
+			{
+				BT_PROFILE("ChNarrowphase::execute");
+				adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, m_internalData->m_pBufPairsGPU, nColPairsFromBP, m_internalData->m_pBufRBodiesGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
+				adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+			}
+
+
+			bool useCpu = false;//true;
+			bool useSolver = true;//true;//false;
+			
+			if (useSolver)
+			{
+				float dt=1./60.;
+				adl::SolverBase::ConstraintCfg csCfg( dt );
+				csCfg.m_enableParallelSolve = true;
+				csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
+				csCfg.m_staticIdx = -1;//numOfConvexRBodies-1;//m_nBodies-1;
+
+			
+			if (useCpu)
+			{
+
+				{
+					BT_PROFILE("read m_pBufContactOutGPU");
+					m_internalData->m_pBufContactOutGPU->read(m_internalData->m_pBufContactOutCPU->m_ptr, nContactOut);//MAX_BROADPHASE_COLLISION_CL);
+					adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+				}
+
+				BT_PROFILE("CPU stuff");
+				adl::Solver<adl::TYPE_HOST>::Data* solverData = adl::Solver<adl::TYPE_HOST>::allocate( m_internalData->m_deviceHost, nContactOut);
+
+				SolverData contactCPU = adl::Solver<adl::TYPE_HOST>::allocateConstraint4( 
+					m_internalData->m_deviceHost, 
+					numOfConvexRBodies*MAX_PAIRS_PER_BODY_CL );
+
+				void* frictionCPU = adl::Solver<adl::TYPE_HOST>::allocateFrictionConstraint( 
+					m_internalData->m_deviceHost, 
+					numOfConvexRBodies*MAX_PAIRS_PER_BODY_CL );
+
+				//write body with current linear/angular velocities to GPU
+				m_internalData->m_bodyInfoBufferGPU->write(m_internalData->m_bodyInfoBufferCPU->m_ptr,numOfConvexRBodies);
+				adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+
+
+				if (nContactOut)
+				{
+					reorderConvertToConstraints2( 
+						solverData, 
+						m_internalData->m_pBufRBodiesCPU, 
+						m_internalData->m_bodyInfoBufferCPU, 
+						m_internalData->m_pBufContactOutCPU,
+						contactCPU, 
+						frictionCPU, 
+						nContactOut, 
+						csCfg );
+
+					bool forceGPU = true;
+
+					if (forceGPU)
+					{
+
+						SolverData contactCPUcopy = adl::Solver<adl::TYPE_HOST>::allocateConstraint4( 
+							m_internalData->m_deviceHost, 
+							numOfConvexRBodies*MAX_PAIRS_PER_BODY_CL );
+
+							adl::Solver<adl::TYPE_CL>::reorderConvertToConstraints( 
+						m_internalData->m_solverDataGPU, 
+						m_internalData->m_pBufRBodiesGPU, 
+						m_internalData->m_bodyInfoBufferGPU, 
+						m_internalData->m_pBufContactOutGPU,
+						m_internalData->m_contactCGPU, 
+						m_internalData->m_frictionCGPU, 
+						nContactOut, 
+						csCfg );
+
+						adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+						m_internalData->m_contactCGPU->read(contactCPUcopy->m_ptr,nContactOut);
+						adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+
+						
+						//m_internalData->m_contactCGPU->write(contactCPU->m_ptr,nContactOut);
+						adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+						m_internalData->m_solverDataGPU->m_nIterations = 4;
+					
+						adl::Solver<adl::TYPE_CL>::solveContactConstraint( m_internalData->m_solverDataGPU, 
+							m_internalData->m_pBufRBodiesGPU, 
+							m_internalData->m_bodyInfoBufferGPU, 
+							m_internalData->m_contactCGPU,
+							0, 
+							nContactOut );
+
+							adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+
+						//read body updated linear/angular velocities back to CPU
+						m_internalData->m_pBufRBodiesGPU->read(
+							m_internalData->m_pBufRBodiesCPU->m_ptr,numOfConvexRBodies);
+							adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+
+					} else
+					{
+					solverData->m_nIterations = 4;
+					adl::Solver<adl::TYPE_HOST>::solveContactConstraint( solverData, 
+						m_internalData->m_pBufRBodiesCPU, 
+						m_internalData->m_bodyInfoBufferCPU, 
+						contactCPU,
+						0, 
+						nContactOut );
+					}
+
+
+
+					}
+
+				adl::Solver<adl::TYPE_HOST>::deallocateConstraint4( contactCPU );
+				adl::Solver<adl::TYPE_HOST>::deallocateFrictionConstraint( frictionCPU );
+				adl::Solver<adl::TYPE_HOST>::deallocate( solverData );
+
+				
+
+			}
+			else
+			{
+				
+				{
+					BT_PROFILE("rigid body data to GPU buffer");
+					// Transfer rigid body data from CPU buffer to GPU buffer
+					m_internalData->m_pBufRBodiesGPU->write(m_internalData->m_pBufRBodiesCPU->m_ptr, numOfConvexRBodies);
+					m_internalData->m_pBufPairsGPU->write(m_internalData->m_pBufPairsCPU->m_ptr, MAX_BROADPHASE_COLLISION_CL);
+					//write body with current linear/angular velocities to GPU
+					m_internalData->m_bodyInfoBufferGPU->write(m_internalData->m_bodyInfoBufferCPU->m_ptr,numOfConvexRBodies);
+					adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+				}
+				{
+					BT_PROFILE("GPU reorderConvertToConstraints");
+					adl::Solver<adl::TYPE_CL>::reorderConvertToConstraints( 
+						m_internalData->m_solverDataGPU, 
+						m_internalData->m_pBufRBodiesGPU, 
+						m_internalData->m_bodyInfoBufferGPU, 
+						m_internalData->m_pBufContactOutGPU,
+						m_internalData->m_contactCGPU, 
+						m_internalData->m_frictionCGPU, 
+						nContactOut, 
+						csCfg );
+				}
+
+				{
+					BT_PROFILE("GPU solveContactConstraint");
+				m_internalData->m_solverDataGPU->m_nIterations = 4;
+					
+					adl::Solver<adl::TYPE_CL>::solveContactConstraint( m_internalData->m_solverDataGPU, 
+						m_internalData->m_pBufRBodiesGPU, 
+						m_internalData->m_bodyInfoBufferGPU, 
+						m_internalData->m_contactCGPU,
+						0, 
+						nContactOut );
+	
+					adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+				}
+				{
+					BT_PROFILE("read body velocities back to CPU");
+					//read body updated linear/angular velocities back to CPU
+					m_internalData->m_pBufRBodiesGPU->read(
+						m_internalData->m_pBufRBodiesCPU->m_ptr,numOfConvexRBodies);
+						adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+				}
+
+				
+			}
+
+#if 0
+				if( !m_useGPUPipeline )
+				{	//	CPU
+						BT_PROFILE("CPU solve");
+						{
+							BT_PROFILE("CPU reorderConvertToConstraints");
+
+					SOLVER_CLASS<TYPE_HOST>::reorderConvertToConstraints( solver, m_bodyBuffer, m_bodyInfoBufferCPU, (Buffer<Contact4>*)m_contactBuffer, 
+						contactC, frictionC, m_numContacts, csCfg );
+						}
+						{
+							BT_PROFILE("CPU solveContactConstraint");
+
+					solver->m_nIterations = 4;
+					SOLVER_CLASS<TYPE_HOST>::solveContactConstraint( solver, m_bodyBuffer, m_bodyInfoBufferCPU, contactC, 0, m_numContacts );
+						}
+				}
+				else
+				{
+						BT_PROFILE("GPU solve");
+					{	//	GPU using host buffers
+						{
+							BT_PROFILE("GPU reorderConvertToConstraints");
+
+						Solver<TYPE_CL>::reorderConvertToConstraints( m_solver, m_bodyBuffer, m_bodyInfoBufferCPU, (Buffer<Contact4>*)m_contactBuffer, 
+							contactC, frictionC, m_numContacts, csCfg );
+						}
+						timerEnd();
+
+						timerStart(0);
+						//for(int iter=0; iter<4; iter++)
+						{
+							BT_PROFILE("GPU solveContactConstraint");
+
+							Solver<TYPE_CL>::solveContactConstraint( m_solver, m_bodyBuffer, m_bodyInfoBufferCPU, contactC, frictionC, m_numContacts );
+						}
+						DeviceUtils::waitForCompletion( m_device );
+					}
+				}
+				timerEnd();
+#endif
+
+
+			}
+
+			//if we ran the solver, it will overwrite the batchIdx so we cannot write back the results
+			//try to make it work by writing velocity back to rigid body
+
+			if (useSolver)
+			{
+				
+				BT_PROFILE("writing velocity back to btRigidBody");
+
+				for ( int i=0; i<overlappingPairArray.size(); i++)
+				{
+					btAssert(i<MAX_BROADPHASE_COLLISION_CL);
+
+					btBroadphasePair* pair = &overlappingPairArray[i];
+
+					btCollisionObject* colObj0 = (btCollisionObject*)pair->m_pProxy0->m_clientObject;
+					btCollisionObject* colObj1 = (btCollisionObject*)pair->m_pProxy1->m_clientObject;
+
+					int bodyIndex0 = colObj0->getCompanionId();
+					int bodyIndex1 = colObj1->getCompanionId();
+
+					RigidBodyBase::Body* bA = &m_internalData->m_pBufRBodiesCPU->m_ptr[bodyIndex0];
+					RigidBodyBase::Body* bB = &m_internalData->m_pBufRBodiesCPU->m_ptr[bodyIndex1];
+					btRigidBody* bodyA = btRigidBody::upcast(colObj0);
+					if (bodyA && !bodyA->isStaticOrKinematicObject())
+					{
+						bodyA->setLinearVelocity(btVector3(
+										bA->m_linVel.x,
+										bA->m_linVel.y,
+										bA->m_linVel.z));
+
+						bodyA->setAngularVelocity(btVector3(
+										bA->m_angVel.x,
+										bA->m_angVel.y,
+										bA->m_angVel.z));
+					}
+					btRigidBody* bodyB = btRigidBody::upcast(colObj1);
+					if (bodyB && !bodyB->isStaticOrKinematicObject())
+					{
+						bodyB->setLinearVelocity(btVector3(
+							bB->m_linVel.x,
+							bB->m_linVel.y,
+							bB->m_linVel.z));
+						bodyB->setAngularVelocity(btVector3(
+										bB->m_angVel.x,
+										bB->m_angVel.y,
+										bB->m_angVel.z));
+
+					}
+
+
+
+
+				}
+			} else
+			{
+				BT_PROFILE("copy Contact4 to btPersistentManifold");
+				// Now we got the narrowphase info from GPU and need to update rigid bodies with the info and go back to the original pipeline in Bullet physics. 
+				for ( int i = 0; i < nContactOut; i++ )
+				{
+					Contact4 contact = (*m_internalData->m_pBufContactOutCPU)[i];
+
+					int idxBodyA = contact.m_bodyAPtr;
+					int idxBodyB = contact.m_bodyBPtr;
+
+					btAssert(contact.m_batchIdx>=0);
+					btAssert(contact.m_batchIdx<overlappingPairArray.size());
+
+					btBroadphasePair* pair = &overlappingPairArray[contact.m_batchIdx];
+
+					btCollisionObject* colObj0 = (btCollisionObject*)pair->m_pProxy0->m_clientObject;
+					btCollisionObject* colObj1 = (btCollisionObject*)pair->m_pProxy1->m_clientObject;
+
+					if (!pair->m_algorithm)
+					{
+						pair->m_algorithm = findAlgorithm(colObj0,colObj1,0);
+					}
+
+					btManifoldResult contactPointResult(colObj0, colObj1);
+
+
+					CustomConvexConvexPairCollision* pairAlgo = (CustomConvexConvexPairCollision*) pair->m_algorithm;
+
+					if (!pairAlgo->getManifoldPtr())
+					{
+						pairAlgo->createManifoldPtr(colObj0,colObj1,dispatchInfo);
+					}
+					
+					contactPointResult.setPersistentManifold(pairAlgo->getManifoldPtr());
+					
+					contactPointResult.getPersistentManifold()->refreshContactPoints(colObj0->getWorldTransform(),colObj1->getWorldTransform());
+
+					const btTransform& transA = colObj0->getWorldTransform();
+					const btTransform& transB = colObj1->getWorldTransform();
+
+					int numPoints = contact.getNPoints();
+
+					for ( int k=0; k < numPoints; k++ )
+					{
+						btVector3 normalOnBInWorld(
+							contact.m_worldNormal.x,
+							contact.m_worldNormal.y,
+							contact.m_worldNormal.z);
+						btVector3 pointInWorldOnB(
+							contact.m_worldPos[k].x,
+							contact.m_worldPos[k].y,
+							contact.m_worldPos[k].z);
+
+						btScalar depth = contact.m_worldPos[k].w;
+
+						if (depth<0)
+						{
+							const btVector3 deltaC = transB.getOrigin() - transA.getOrigin();
+
+							normalOnBInWorld.normalize();
+
+							if((deltaC.dot(normalOnBInWorld))>0.0f)
+							{
+								normalOnBInWorld= -normalOnBInWorld;
+
+								contactPointResult.addContactPoint(normalOnBInWorld, pointInWorldOnB, depth);
+							}
+							else
+							{
+								contactPointResult.addContactPoint(normalOnBInWorld, pointInWorldOnB-normalOnBInWorld*depth, depth);
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+#endif //CL_PLATFORM_AMD
+	}
+
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef CUSTOM_COLLISION_DISPATCHER_H
+#define CUSTOM_COLLISION_DISPATCHER_H
+
+
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+
+
+// Capacity limits used by the OpenCL dispatch path:
+//   MAX_CONVEX_BODIES_CL        - exclusive upper bound for a body's companion index.
+//   MAX_PAIRS_PER_BODY_CL       - per-body pair budget used to size constraint buffers.
+//   MAX_CONVEX_SHAPES_CL        - exclusive upper bound for accelerated shape slots.
+//   MAX_BROADPHASE_COLLISION_CL - maximum number of broadphase pairs per dispatch.
+// Expression-valued macros are fully parenthesized so they expand correctly next to
+// higher-precedence operators (for example `x % MAX_CONVEX_BODIES_CL`).
+#define MAX_CONVEX_BODIES_CL (64*1024)
+#define MAX_PAIRS_PER_BODY_CL 32
+#define MAX_CONVEX_SHAPES_CL 8192
+#define MAX_BROADPHASE_COLLISION_CL (MAX_CONVEX_BODIES_CL*MAX_PAIRS_PER_BODY_CL)
+
+
+
+struct	CustomDispatchData;
+
+#ifdef CL_PLATFORM_AMD
+#ifdef __APPLE__
+	#ifdef USE_MINICL
+		#include <MiniCL/cl.h>
+	#else
+		#include <OpenCL/cl.h>
+	#endif
+#else //__APPLE__
+	#ifdef USE_MINICL
+		#include <MiniCL/cl.h>
+	#else
+		#include <CL/cl.h>
+	#endif
+#endif //__APPLE__
+#endif
+/*
+ * Collision dispatcher derived from btCollisionDispatcher that re-declares the
+ * virtual dispatchAllCollisionPairs(). When CL_PLATFORM_AMD is defined the
+ * constructor additionally accepts an OpenCL context, device and command queue,
+ * each defaulting to NULL.
+ */
+class CustomCollisionDispatcher : public btCollisionDispatcher
+{
+public:
+	CustomCollisionDispatcher (btCollisionConfiguration* collisionConfiguration
+#ifdef CL_PLATFORM_AMD
+		, cl_context context = NULL,cl_device_id device = NULL,cl_command_queue queue = NULL
+#endif //CL_PLATFORM_AMD
+		);
+
+	virtual ~CustomCollisionDispatcher(void);
+
+protected:
+	// Implementation state; CustomDispatchData is only forward-declared in this header.
+	CustomDispatchData*	m_internalData;
+	// NOTE(review): presumably looks up the broadphase pair for two body indices -- confirm against the definition.
+	btBroadphasePair* GetPair(btBroadphasePairArray& pairArray, int idxBodyA, int idxBodyB);
+
+public:
+	virtual void dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher);
+};
+
+#endif //CUSTOM_COLLISION_DISPATCHER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.cpp
@@ -0,0 +1,409 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "CustomConvexPairCollision.h"
+#include "ConvexHeightFieldShape.h"
+#include "CustomConvexShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "Stubs/AdlContact4.h"
+#include "Stubs/AdlTransform.h"
+
+
+// Forwards every argument unchanged to the btConvexConvexAlgorithm base class;
+// this class adds no state of its own to initialise.
+CustomConvexConvexPairCollision::CustomConvexConvexPairCollision(
+	btPersistentManifold* mf,
+	const btCollisionAlgorithmConstructionInfo& ci,
+	btCollisionObject* body0,
+	btCollisionObject* body1,
+	btSimplexSolverInterface* simplexSolver,
+	btConvexPenetrationDepthSolver* pdSolver,
+	int numPerturbationIterations,
+	int minimumPointsPerturbationThreshold)
+	: btConvexConvexAlgorithm(mf, ci, body0, body1, simplexSolver, pdSolver,
+		numPerturbationIterations, minimumPointsPerturbationThreshold)
+{
+}
+
+// Empty on purpose: this class releases nothing itself.
+CustomConvexConvexPairCollision::~CustomConvexConvexPairCollision()
+{
+}
+
+
+// InterlockedExchangeAdd only exists on Windows. Other toolchains fall back to the
+// GCC/Clang __sync builtin, so <Windows.h> is pulled in only where it is available.
+#if defined(_WIN32)
+#include <Windows.h>
+#endif
+
+/*
+ * Atomically adds `value` to the integer stored at `ptr` and returns the value that
+ * was stored there BEFORE the addition (fetch-and-add).
+ *
+ * ptr   - address of the counter. The parameter stays `const T*` for source
+ *         compatibility with existing callers, but the pointee IS modified, so it
+ *         must not refer to a genuinely const object.
+ * value - amount to add.
+ *
+ * The storage is accessed as a 32-bit integer, so T must be a 32-bit integral type
+ * (the caller in this file passes `int`).
+ */
+template<typename T>
+T atomAdd(const T* ptr, int value)
+{
+#if defined(_WIN32)
+	return (T)InterlockedExchangeAdd((LONG*)const_cast<T*>(ptr), value);
+#else
+	return (T)__sync_fetch_and_add((int*)const_cast<T*>(ptr), value);
+#endif
+}
+
+
+/*
+ * Serial loop helper macros.
+ *   PARALLEL_SUM(v, n)        - accumulates v[1..n-1] into v[0] using +=.
+ *   PARALLEL_DO(execution, n) - runs `execution` n times; the loop index is named
+ *                               `ie` and may be referenced inside `execution`.
+ *   REDUCE_MAX(v, n)          - leaves in v[0] an element with the largest .y.
+ *   REDUCE_MIN(v, n)          - leaves in v[0] an element with the smallest .y.
+ * The macros are unhygienic: they declare locals named j, ie, i and offset, and
+ * their arguments are not parenthesized, so pass simple identifiers only.
+ */
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}
+#define REDUCE_MAX(v, n) {int i=0;\
+	for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }
+#define REDUCE_MIN(v, n) {int i=0;\
+	for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }
+
+/*
+ * Picks up to four representative contact points out of a set of candidates.
+ *
+ * p          - candidate points.
+ * nPoints    - number of candidates in p. Only the first 64 are considered.
+ * nearNormal - axis the two sampling directions are built from (it is only read).
+ * centerOut  - receives the component-wise average of the considered candidates.
+ * contactIdx - receives the chosen indices into p, sorted ascending.
+ *
+ * Returns the number of indices written to contactIdx: 0 when nPoints is not
+ * positive, nPoints when there are fewer than four candidates, and 4 otherwise.
+ * In the last case the indices are the candidates with the largest projection onto
+ * +u, -u, +v and -v, where u and v are built from nearNormal and p[0]-center. One
+ * candidate can win several directions, so the four indices need not be distinct.
+ */
+int extractManifold(const float4* p, int nPoints, float4& nearNormal, float4& centerOut, 
+					 int contactIdx[4])
+{
+	//	Reject empty (and nonsensical negative) counts before touching p.
+	if( nPoints <= 0 ) return 0;
+
+	nPoints = min2( nPoints, 64 );
+
+	//	Average of the candidates, accumulated in index order.
+	float4 center = p[0];
+	for(int j=1; j<nPoints; j++) center += p[j];
+	center = center/(float)nPoints;
+
+	centerOut = center;
+
+	if( nPoints < 4 )
+	{
+		//	Too few candidates to reduce: keep all of them.
+		for(int i=0; i<nPoints; i++) contactIdx[i] = i;
+		return nPoints;
+	}
+
+	//	Sampling directions derived from nearNormal and the first candidate.
+	float4 aVector = p[0] - center;
+	float4 u = cross3( nearNormal, aVector );
+	float4 v = cross3( nearNormal, u );
+	u = normalize3( u );
+	v = normalize3( v );
+
+	float4 dir0 = u;
+	float4 dir1 = -u;
+	float4 dir2 = v;
+	float4 dir3 = -v;
+
+	//	Each key packs a projection and its candidate index into one integer: the low
+	//	8 bits of the float's bit pattern are replaced by the index (always < 64), so
+	//	a single integer max yields both the winning value and who produced it.
+	//	Integer comparison orders non-negative floats correctly, and because the
+	//	projections are taken about the mean, the maximum along each direction is
+	//	non-negative up to rounding.
+	int4 best;
+	for(int ie=0; ie<nPoints; ie++)
+	{
+		float4 r = p[ie]-center;
+
+		float4 f;
+		f.x = dot3F4( dir0, r );
+		f.y = dot3F4( dir1, r );
+		f.z = dot3F4( dir2, r );
+		f.w = dot3F4( dir3, r );
+
+		//	memcpy reads the bit pattern without violating strict aliasing.
+		u32 bits;
+		int4 key;
+
+		memcpy( &bits, &f.x, sizeof(bits) );
+		key.x = (bits & 0xffffff00);
+		key.x |= (0xff & ie);
+
+		memcpy( &bits, &f.y, sizeof(bits) );
+		key.y = (bits & 0xffffff00);
+		key.y |= (0xff & ie);
+
+		memcpy( &bits, &f.z, sizeof(bits) );
+		key.z = (bits & 0xffffff00);
+		key.z |= (0xff & ie);
+
+		memcpy( &bits, &f.w, sizeof(bits) );
+		key.w = (bits & 0xffffff00);
+		key.w |= (0xff & ie);
+
+		if( ie == 0 )
+		{
+			best = key;
+			continue;
+		}
+
+		best.x = (best.x > key.x )? best.x: key.x;
+		best.y = (best.y > key.y )? best.y: key.y;
+		best.z = (best.z > key.z )? best.z: key.z;
+		best.w = (best.w > key.w )? best.w: key.w;
+	}
+
+	//	Recover the candidate index stored in the low byte of each winning key.
+	contactIdx[0] = (int)best.x & 0xff;
+	contactIdx[1] = (int)best.y & 0xff;
+	contactIdx[2] = (int)best.z & 0xff;
+	contactIdx[3] = (int)best.w & 0xff;
+
+	std::sort( contactIdx, contactIdx+4 );
+
+	return 4;
+}
+
+// Drop all four function-like helper macros defined above so none of them leaks
+// into the rest of this translation unit.
+#undef PARALLEL_SUM
+#undef PARALLEL_DO
+#undef REDUCE_MAX
+#undef REDUCE_MIN
+
+/*
+ * Tests the sample points of shapeB against shapeA and, when at least one sample
+ * lies closer than collisionMargin, reserves one slot in contactsOut and fills it
+ * with up to four contact points.
+ *
+ * shapeA, shapeB       - height-field shapes; B supplies the samples, A is queried.
+ * bodyApos, bodyAquat  - position / orientation of A.
+ * bodyBpos, bodyBquat  - position / orientation of B.
+ * contactsOut          - output array of ContactPoint4.
+ * numContacts          - running slot counter; it is incremented whenever there is
+ *                        at least one hit, even if the slot is then rejected.
+ * contactCapacity      - a slot is written only while slot+1 < contactCapacity.
+ * collisionMargin      - distance threshold for a sample to count as a hit.
+ *
+ * Returns the point count reported by extractManifold (clamped to 4 when the
+ * contact was stored), or 0 when no sample of B is within the margin.
+ */
+int collideStraight(const ConvexHeightField* shapeA,const ConvexHeightField* shapeB,
+		const float4& bodyApos, Quaternion& bodyAquat,const float4& bodyBpos,const Quaternion& bodyBquat,
+		ContactPoint4* contactsOut, int& numContacts, int contactCapacity,
+		float collisionMargin )
+{
+	const int samplesPerFace = ConvexHeightField::HEIGHT_RES*ConvexHeightField::HEIGHT_RES;
+	const bool aIsHeightField = ( shapeA->m_type == CollisionShape::SHAPE_HEIGHT_FIELD );
+
+	//	Compose inverse(A) * B so samples of B can be expressed in A's frame.
+	Transform frameA = trSetTransform( bodyApos, bodyAquat );
+	Transform frameB = trSetTransform( bodyBpos, bodyBquat );
+	Transform inverseA = trInvert( frameA );
+	Transform aFromB = trMul( inverseA, frameB );
+
+	//	Hits are stored in A's frame with the queried distance in w.
+	float4 hits[ConvexHeightField::HEIGHT_RES*ConvexHeightField::HEIGHT_RES*6];
+	int hitCount = 0;
+
+	const float4* samplesB = shapeB->getSamplePoints();
+
+	//	Direction from B towards A in B's frame; zeroed for height fields so the
+	//	facing test below never rejects a sample.
+	float4 towardA = qtInvRotate( bodyBquat, bodyApos - bodyBpos );
+	if( aIsHeightField )
+		towardA = make_float4(0,0,0,0);
+
+	for(int face=0; face<6; face++)
+	{
+		//	Skip the whole face when its bounds miss A.
+		Aabb faceBounds = shapeB->m_faceAabbs[face];
+		faceBounds.transform( aFromB.m_translation, aFromB.m_rotation );
+		if( !shapeA->m_aabb.overlaps( faceBounds ) ) continue;
+
+		const float4* faceSamples = samplesB + face*samplesPerFace;
+		for(int s=0; s<samplesPerFace; s++)
+		{
+			const float4& sampleInB = faceSamples[s];
+			if( dot3F4( towardA, sampleInB ) < 0.f ) continue;
+
+			float4 sampleInA = trMul1( aFromB, sampleInB );
+			if( !shapeA->m_aabb.overlaps( sampleInA ) ) continue;
+
+			float dist = shapeA->queryDistance( sampleInA );
+			if( !( dist < collisionMargin ) ) continue;
+
+			hits[hitCount] = make_float4(sampleInA.x, sampleInA.y, sampleInA.z, dist);
+			hitCount++;
+		}
+	}
+
+	if( !hitCount ) return 0;
+
+	//	Reference axis handed to extractManifold.
+	float4 aToB = qtInvRotate( bodyAquat, bodyBpos - bodyApos );
+	if( aIsHeightField )
+	{
+		//todo.	sample normal from height field but just fake here
+		aToB = make_float4(0,1,0,0);
+	}
+
+	int picked[4];
+	float4 centroid;
+	int nContacts = extractManifold( hits, hitCount, aToB, centroid, picked );
+
+	//	Contact normal is sampled from A at the centroid of the hits.
+	float4 normalInA;
+	shapeA->queryDistanceWithNormal( centroid, normalInA );
+	normalInA = normalize3( normalInA );
+
+	const int slot = atomAdd( &numContacts, 1 );
+	if( slot+1 < contactCapacity )
+	{
+		ContactPoint4& out = contactsOut[slot];
+		nContacts = min2( nContacts, 4 );
+		for(int k=0; k<nContacts; k++)
+		{
+			float4& hit = hits[picked[k]];
+			out.m_worldPos[k] = transform( hit, bodyApos, bodyAquat );
+			//	w carries the depth: distance minus margin, floored at -2*margin.
+			out.m_worldPos[k].w = max2( hit.w - collisionMargin, -2*collisionMargin );
+		}
+		out.m_worldNormal = normalize3( qtRotate( bodyAquat, normalInA ) );
+		out.m_restituitionCoeff = 0.f;
+		out.m_frictionCoeff = 0.7f;
+		out.getNPoints() = nContacts;
+	}
+
+	return nContacts;
+}
+
+
+// Obtains a new manifold for the two bodies from the dispatcher and flags it as
+// owned by this algorithm. dispatchInfo is accepted but not used.
+void CustomConvexConvexPairCollision::createManifoldPtr(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo)
+{
+	(void)dispatchInfo;
+
+	btPersistentManifold* freshManifold = m_dispatcher->getNewManifold(body0,body1);
+	m_manifoldPtr = freshManifold;
+	m_ownManifold = true;
+}
+
+	
+// Copies a Bullet origin into a float4 with w set to zero.
+static float4 customConvexToFloat4(const btVector3& origin)
+{
+	float4 pos;
+	pos.x = origin.getX();
+	pos.y = origin.getY();
+	pos.z = origin.getZ();
+	pos.w = 0.f;
+	return pos;
+}
+
+// Copies a Bullet quaternion component by component.
+static Quaternion customConvexToQuaternion(const btQuaternion& rot)
+{
+	Quaternion q;
+	q.x = rot.getX();
+	q.y = rot.getY();
+	q.z = rot.getZ();
+	q.w = rot.getW();
+	return q;
+}
+
+/*
+ * Collides two bodies whose shapes are CustomConvexShape instances. The height
+ * fields are tested in both directions (body1's samples against body0, then
+ * body0's samples against body1) and every resulting point with negative depth is
+ * added to resultOut. A manifold is requested from the dispatcher on first use.
+ * dispatchInfo is only consumed by the disabled fallback branch.
+ */
+void CustomConvexConvexPairCollision::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+{
+#if 1
+	if (!m_manifoldPtr)
+	{
+		//swapped?
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+		m_ownManifold = true;
+	}
+	resultOut->setPersistentManifold(m_manifoldPtr);
+
+	CustomConvexShape* shape0 = (CustomConvexShape*)body0->getCollisionShape();
+	CustomConvexShape* shape1 = (CustomConvexShape*)body1->getCollisionShape();
+
+	const btTransform& xform0 = body0->getWorldTransform();
+	const btTransform& xform1 = body1->getWorldTransform();
+
+	float4 posA = customConvexToFloat4(xform0.getOrigin());
+	float4 posB = customConvexToFloat4(xform1.getOrigin());
+	Quaternion quatA = customConvexToQuaternion(xform0.getRotation());
+	Quaternion quatB = customConvexToQuaternion(xform1.getRotation());
+
+#define CAPACITY_CONTACTS 4
+
+	ContactPoint4 contacts[CAPACITY_CONTACTS];
+	int contactCount = 0;
+	const int contactCapacity = CAPACITY_CONTACTS;
+	const float collisionMargin = 0.001f;
+
+	m_manifoldPtr->refreshContactPoints(xform0,xform1);
+
+	//	First pass queries shape0 with shape1's samples, second pass the reverse.
+	collideStraight(shape0->m_ConvexHeightField,shape1->m_ConvexHeightField,
+		posA, quatA,posB,quatB,
+		contacts, contactCount, contactCapacity,
+		collisionMargin );
+	collideStraight(shape1->m_ConvexHeightField,shape0->m_ConvexHeightField,
+		posB, quatB,posA,quatA,
+		contacts, contactCount, contactCapacity,
+		collisionMargin );
+
+	//	Each pass contributes at most one ContactPoint4.
+	btAssert(contactCount<3);
+
+	const btVector3 deltaC = xform1.getOrigin() - xform0.getOrigin();
+
+	for (int c=0;c<contactCount;c++)
+	{
+		ContactPoint4& contact = contacts[c];
+		const int numPoints = contact.getNPoints();
+
+		for (int k=0;k<numPoints;k++)
+		{
+			const btScalar depth = contact.m_worldPos[k].w;
+			if (!(depth<0))
+				continue;
+
+			btVector3 normalOnBInWorld(
+				contact.m_worldNormal.x,
+				contact.m_worldNormal.y,
+				contact.m_worldNormal.z);
+			btVector3 pointInWorldOnB(
+				contact.m_worldPos[k].x,
+				contact.m_worldPos[k].y,
+				contact.m_worldPos[k].z);
+
+			//	Flip the normal when it points along body0 -> body1, then normalise.
+			if((deltaC.dot(normalOnBInWorld))>0.0f)
+				normalOnBInWorld= -normalOnBInWorld;
+			normalOnBInWorld.normalize();
+
+			if (c)
+				resultOut->addContactPoint(normalOnBInWorld, pointInWorldOnB, depth);
+			else
+				resultOut->addContactPoint(normalOnBInWorld, pointInWorldOnB-normalOnBInWorld*depth, depth);
+		}
+	}
+#else
+	btConvexConvexAlgorithm::processCollision(body0,body1,dispatchInfo,resultOut);
+#endif
+}
+
+
+
+// Passes both solvers straight through to btConvexConvexAlgorithm::CreateFunc.
+CustomConvexConvexPairCollision::CreateFunc::CreateFunc(
+	btSimplexSolverInterface* simplexSolver,
+	btConvexPenetrationDepthSolver* pdSolver)
+	: btConvexConvexAlgorithm::CreateFunc(simplexSolver, pdSolver)
+{
+}
+		
+// Empty on purpose: the factory holds nothing that needs releasing here.
+CustomConvexConvexPairCollision::CreateFunc::~CreateFunc()
+{
+}
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.h
@@ -0,0 +1,56 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef CUSTOM_CONVEX_CONVEX_PAIR_COLLISION_H
+#define CUSTOM_CONVEX_CONVEX_PAIR_COLLISION_H
+
+
+#include "BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h"
+
+// Convex-convex pair algorithm derived from btConvexConvexAlgorithm. It replaces
+// processCollision() and exposes the manifold pointer so code outside the
+// algorithm can read it or have one created.
+class CustomConvexConvexPairCollision : public btConvexConvexAlgorithm
+{
+public:
+	CustomConvexConvexPairCollision(
+		btPersistentManifold* mf,
+		const btCollisionAlgorithmConstructionInfo& ci,
+		btCollisionObject* body0,
+		btCollisionObject* body1,
+		btSimplexSolverInterface* simplexSolver,
+		btConvexPenetrationDepthSolver* pdSolver,
+		int numPerturbationIterations,
+		int minimumPointsPerturbationThreshold);
+	virtual ~CustomConvexConvexPairCollision();
+
+	virtual void processCollision(btCollisionObject* body0, btCollisionObject* body1, const btDispatcherInfo& dispatchInfo, btManifoldResult* resultOut);
+
+	// Returns the manifold pointer currently held by this algorithm.
+	btPersistentManifold* getManifoldPtr() { return m_manifoldPtr; }
+
+	void createManifoldPtr(btCollisionObject* body0, btCollisionObject* body1, const btDispatcherInfo& dispatchInfo);
+
+	// Factory that builds this algorithm inside memory supplied by the dispatcher.
+	struct CreateFunc : public btConvexConvexAlgorithm::CreateFunc
+	{
+		CreateFunc(btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver);
+
+		virtual ~CreateFunc();
+
+		virtual btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0, btCollisionObject* body1)
+		{
+			// The dispatcher provides the storage; the object is placement-constructed in it.
+			void* storage = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(CustomConvexConvexPairCollision));
+			CustomConvexConvexPairCollision* algorithm = new(storage) CustomConvexConvexPairCollision(
+				ci.m_manifold, ci, body0, body1,
+				m_simplexSolver, m_pdSolver,
+				m_numPerturbationIterations, m_minimumPointsPerturbationThreshold);
+			return algorithm;
+		}
+	};
+};
+
+
+#endif //CUSTOM_CONVEX_CONVEX_PAIR_COLLISION_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.cpp
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "CustomConvexShape.h"
+#include "ConvexHeightFieldShape.h"
+#include "BulletCollision/CollisionShapes/btConvexPolyhedron.h"
+
+
+/*
+ * Builds a convex hull shape from the given point cloud, tags it with
+ * CUSTOM_POLYHEDRAL_SHAPE_TYPE and creates a ConvexHeightField from the plane
+ * equations of the hull's faces. The accelerated companion shape index starts
+ * at -1.
+ *
+ * points, numPoints, stride - forwarded unchanged to btConvexHullShape.
+ */
+CustomConvexShape::CustomConvexShape(const btScalar* points,int numPoints, int stride)
+	: btConvexHullShape(points,numPoints,stride),
+	  m_acceleratedCompanionShapeIndex(-1)
+{
+	m_shapeType = CUSTOM_POLYHEDRAL_SHAPE_TYPE;
+
+	//	Populates m_polyhedron, which supplies the face planes below.
+	initializePolyhedralFeatures();
+
+	const int faceCount = m_polyhedron->m_faces.size();
+
+	//	NOTE(review): this array is never freed in this constructor -- confirm
+	//	whether ConvexHeightField takes ownership of it; otherwise it leaks.
+	float4* planeEquations = new float4[faceCount];
+	for (int face=0; face<faceCount; ++face)
+	{
+		float4& equation = planeEquations[face];
+		equation.x = m_polyhedron->m_faces[face].m_plane[0];
+		equation.y = m_polyhedron->m_faces[face].m_plane[1];
+		equation.z = m_polyhedron->m_faces[face].m_plane[2];
+		equation.w = m_polyhedron->m_faces[face].m_plane[3];
+	}
+
+	m_ConvexHeightField = new ConvexHeightField(planeEquations,faceCount);
+}
+// Frees the ConvexHeightField that the constructor allocated with `new`.
+CustomConvexShape::~CustomConvexShape()
+{
+	delete m_ConvexHeightField;
+}
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.h
@@ -0,0 +1,35 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef CUSTOM_CONVEX_SHAPE_H
+#define CUSTOM_CONVEX_SHAPE_H
+
+#include "BulletCollision/CollisionShapes/btConvexHullShape.h"
+
+//	Convex hull shape that additionally carries a ConvexHeightField built from its faces.
+//	NOTE(review): the class owns m_ConvexHeightField through a raw pointer and declares no copy
+//	constructor/assignment, so copying an instance would delete the field twice.
+class CustomConvexShape  : public btConvexHullShape
+{
+	public:
+		
+		//	allocated in the constructor, deleted in the destructor
+		class ConvexHeightField* m_ConvexHeightField;
+
+		//	initialised to -1 by the constructor
+		int m_acceleratedCompanionShapeIndex;
+
+		CustomConvexShape(const btScalar* points,int numPoints,int stride);
+		virtual ~CustomConvexShape();
+		
+};
+
+#endif //CUSTOM_CONVEX_SHAPE_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.h
@@ -0,0 +1,230 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifndef AABB_H
+#define AABB_H
+
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlQuaternion.h"
+
+//	Shape type ids; numbering starts at 2, so the enumerators are 2, 3 and 4.
+//	NOTE(review): SHAPE_CONVEX_HEIGHT_FIELD lacks the ADL_ prefix of its siblings and shares its
+//	name with CollisionShape::SHAPE_CONVEX_HEIGHT_FIELD, which has a different numeric value.
+enum AdlCollisionShapeTypes
+{
+	ADL_SHAPE_SPHERE=2,
+	ADL_SHAPE_HEIGHT_FIELD,
+	SHAPE_CONVEX_HEIGHT_FIELD,
+};
+
+//	Axis-aligned bounding box stored as two float4 corners, 16-byte aligned.
+//	The box logic below only reads/writes the x, y and z lanes explicitly.
+_MEM_CLASSALIGN16
+struct Aabb
+{
+	public:
+		_MEM_ALIGNED_ALLOCATOR16;
+
+		//	resets to the inverted box (max=-FLT_MAX, min=FLT_MAX) so any include*() call sets real bounds
+		__inline
+		void setEmpty();
+		//	grows this box to also contain aabb
+		__inline
+		void includeVolume( const Aabb& aabb );
+		//	grows this box to also contain p
+		__inline
+		void includePoint( const float4& p );
+		//	true when p lies inside or on the boundary
+		__inline
+		bool overlaps( const float4& p ) const;
+		//	true when this box and aabb touch or intersect
+		__inline
+		bool overlaps( const Aabb& aabb ) const;
+		//	midpoint of the two corners
+		__inline
+		float4 center() const;
+		//	index (0,1,2) of the axis with the largest extent
+		__inline
+		int getMajorAxis() const;
+		//	m_max - m_min
+		__inline
+		float4 getExtent() const;
+		//	pushes both corners outwards by r
+		__inline
+		void expandBy( const float4& r );
+
+		__inline
+		static bool overlaps( const Aabb& a, const Aabb& b );
+
+		//	slab test of a segment against the box; the implementation reads from and invRay only (to is unused)
+		__inline
+		bool intersect(const float4* from, const float4* to, const float4* invRay) const;
+
+		//	replaces this box by the bounds of the rotated and translated box
+		__inline
+		void transform(const float4& translation, const Quaternion& quat);
+
+		__inline
+		void transform(const float4& translation, const Matrix3x3& rot);
+
+	public:
+		float4 m_max;
+		float4 m_min;
+};
+
+//	Turns the box inside out (min above max) so that the first included point or volume defines it.
+void Aabb::setEmpty()
+{
+	const float limit = FLT_MAX;
+
+	m_min = make_float4( limit );
+	m_max = make_float4( -limit );
+}
+
+//	Enlarges this box, per axis, until it also encloses aabb.
+void Aabb::includeVolume(const Aabb& aabb)
+{
+	const float4& otherMax = aabb.m_max;
+	const float4& otherMin = aabb.m_min;
+
+	//	upper corner
+	m_max.x = max2( m_max.x, otherMax.x );
+	m_max.y = max2( m_max.y, otherMax.y );
+	m_max.z = max2( m_max.z, otherMax.z );
+
+	//	lower corner
+	m_min.x = min2( m_min.x, otherMin.x );
+	m_min.y = min2( m_min.y, otherMin.y );
+	m_min.z = min2( m_min.z, otherMin.z );
+}
+
+//	Enlarges this box, per axis, until it also contains the point p.
+void Aabb::includePoint( const float4& p )
+{
+	//	upper corner
+	m_max.x = max2( m_max.x, p.x );
+	m_max.y = max2( m_max.y, p.y );
+	m_max.z = max2( m_max.z, p.z );
+
+	//	lower corner
+	m_min.x = min2( m_min.x, p.x );
+	m_min.y = min2( m_min.y, p.y );
+	m_min.z = min2( m_min.z, p.z );
+}
+
+//	Point containment test; points exactly on the boundary count as inside.
+bool Aabb::overlaps( const float4& p ) const
+{
+	const float4 toUpper = m_max-p;
+	const float4 fromLower = p-m_min;
+
+	//	kept as ">= 0" tests on the differences so NaN inputs are still rejected
+	const bool underMax = (toUpper.x >= 0 && toUpper.y >= 0 && toUpper.z >= 0);
+	const bool overMin = (fromLower.x >= 0 && fromLower.y >= 0 && fromLower.z >= 0);
+
+	return underMax && overMin;
+}
+
+//	Box/box test against this instance; forwards to the static two-box overload.
+bool Aabb::overlaps( const Aabb& in ) const
+{
+	const Aabb& self = *this;
+	return Aabb::overlaps( self, in );
+}
+
+//	Separating-axis test on x, y and z: the boxes overlap unless a gap exists on some axis.
+//	Touching boxes count as overlapping.
+bool Aabb::overlaps( const Aabb& a, const Aabb& b )
+{
+	const bool gapX = ( a.m_max.x < b.m_min.x || a.m_min.x > b.m_max.x );
+	const bool gapY = ( a.m_max.y < b.m_min.y || a.m_min.y > b.m_max.y );
+	const bool gapZ = ( a.m_max.z < b.m_min.z || a.m_min.z > b.m_max.z );
+
+	return !( gapX || gapY || gapZ );
+}
+
+//	Midpoint between the two corners.
+float4 Aabb::center() const
+{
+	const float4 cornerSum = m_max+m_min;
+	return 0.5f*cornerSum;
+}
+
+//	Returns 0, 1 or 2 for the axis along which the box is longest.
+//	Ties keep the lower axis index because the comparison is strict.
+int Aabb::getMajorAxis() const
+{
+	const float4 size = getExtent();
+
+	int best = 0;
+	for(int axis=1; axis<3; axis++)
+	{
+		if( size.s[axis] > size.s[best] )
+			best = axis;
+	}
+
+	return best;
+}
+
+//	Full size of the box along each axis (not the half extent).
+float4 Aabb::getExtent() const
+{
+	return m_max-m_min;
+}
+
+//	Moves m_max up and m_min down by r.
+//	Note: r is a reference, so if it aliases m_max the second statement sees the already updated value.
+void Aabb::expandBy( const float4& r )
+{
+	m_max += r;
+	m_min -= r;
+}
+
+//	Slab test of a segment against the box.
+//	from   : segment start
+//	to     : not read by this implementation
+//	invRay : multiplied component-wise with the corner offsets (presumably 1/(to-from) -- confirm with callers)
+//	Returns true when the parametric overlap of the three slabs, clamped to [0,1], is non-empty.
+bool Aabb::intersect(const float4* from, const float4* to, const float4* invRay) const
+{
+	//	parametric distances to the two planes of every slab
+	float4 tToMax = (m_max - *from);
+	tToMax *= *invRay;
+	float4 tToMin = (m_min - *from);
+	tToMin *= *invRay;
+
+	//	per-axis exit and entry parameters
+	const float4 tExit = max2(tToMax, tToMin);
+	const float4 tEnter = min2(tToMax, tToMin);
+
+	float earliestExit = min2(tExit.x, min2(tExit.y, tExit.z));
+	float latestEntry = max2(tEnter.x, max2(tEnter.y, tEnter.z));
+
+	//	restrict to the segment itself
+	earliestExit = min2(1.0f, earliestExit );
+	latestEntry = max2(0.0f, latestEntry);
+
+	return (earliestExit >= latestEntry);
+}
+
+//	Replaces this box by the axis-aligned bounds of the box after rotating it by m and
+//	translating it by translation.
+//	For every output axis j the new bounds are the translation plus the sum, over the three
+//	input axes, of the smaller (for m_min) or larger (for m_max) of row_j*min and row_j*max.
+void Aabb::transform(const float4& translation, const Matrix3x3& m)
+{
+	//	per-row products with the old corners; taken before m_min/m_max are overwritten
+	float4 e[] = { m.m_row[0]*m_min, m.m_row[1]*m_min, m.m_row[2]*m_min };
+	float4 f[] = { m.m_row[0]*m_max, m.m_row[1]*m_max, m.m_row[2]*m_max };
+
+	m_max = m_min = translation;
+
+	for(int j=0; j<3; j++)
+	{
+		float4 mi = make_float4( min2( e[j].x, f[j].x ), min2( e[j].y, f[j].y ), min2( e[j].z, f[j].z ) );
+		float4 ma = make_float4( max2( e[j].x, f[j].x ), max2( e[j].y, f[j].y ), max2( e[j].z, f[j].z ) );
+
+		//	s[j] addresses the x/y/z lane of the float4 union
+		m_min.s[j] += mi.x+mi.y+mi.z;
+		m_max.s[j] += ma.x+ma.y+ma.z;
+	}
+}
+
+//	Quaternion flavour: converts quat to a rotation matrix and defers to the matrix overload.
+void Aabb::transform(const float4& translation, const Quaternion& quat)
+{
+	const Matrix3x3 rotation = qtGetRotationMatrix( quat );
+	transform( translation, rotation );
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlArray.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlArray.h
@@ -0,0 +1,212 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ARRAY_H
+#define ARRAY_H
+
+#include <string.h>
+#include <malloc.h>
+#include <Common/Base/Error.h>
+#include <new.h>
+
+
+//	Growable array backed by 16-byte aligned storage (_aligned_malloc/_aligned_free).
+//	Storage is relocated with memcpy and element destructors are never run, so it is only
+//	suitable for trivially copyable element types.
+template <class T>
+class Array
+{
+	public:
+		//	empty array with DEFAULT_SIZE slots reserved
+		__inline
+		Array();
+		//	array holding size default-constructed elements
+		__inline
+		Array(int size);
+		__inline
+		~Array();
+		__inline
+		T& operator[] (int idx);
+		__inline
+		const T& operator[] (int idx) const;
+		//	appends a copy of elem, growing the storage by INCREASE_SIZE slots when full
+		__inline
+		void pushBack(const T& elem);
+		__inline
+		void popBack();
+		//	sets the size to 0; capacity is kept
+		__inline
+		void clear();
+		//	changes the element count, reallocating when size exceeds the capacity
+		__inline
+		void setSize(int size);
+		__inline
+		int getSize() const;
+		__inline
+		T* begin();
+		__inline
+		const T* begin() const;
+		//	index of the first element equal to data, or -1
+		__inline
+		int indexOf(const T& data) const;
+		//	removes element idx by overwriting it with the last element (order is not preserved)
+		__inline
+		void removeAt(int idx);
+		//	grows the array by one element and returns a reference to it
+		__inline
+		T& expandOne();
+
+	private:
+		//	Not copyable: a member-wise copy would share m_data and free it twice.
+		//	Assignment is declared but intentionally left undefined, matching the private copy constructor.
+		Array(const Array& a){}
+		Array& operator=(const Array& a);
+
+	private:
+		enum
+		{
+			DEFAULT_SIZE = 128,
+			INCREASE_SIZE = 128,
+		};
+
+		T* m_data;
+		int m_size;
+		int m_capacity;
+};
+
+//	Creates an empty array with DEFAULT_SIZE default-constructed slots reserved.
+template<class T>
+Array<T>::Array()
+: m_data( NULL ), m_size( 0 ), m_capacity( DEFAULT_SIZE )
+{
+	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+
+	//	placement-construct every reserved slot
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&m_data[slot])T;
+		slot++;
+	}
+}
+
+//	Creates an array of size default-constructed elements; capacity equals size.
+template<class T>
+Array<T>::Array(int size)
+: m_data( NULL ), m_size( size ), m_capacity( size )
+{
+	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+
+	//	placement-construct every slot
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&m_data[slot])T;
+		slot++;
+	}
+}
+
+//	Frees the aligned storage.
+//	Note: the destructors of the stored elements are not invoked.
+template<class T>
+Array<T>::~Array()
+{
+	if( m_data )
+	{
+//		delete [] m_data;
+		_aligned_free( m_data );
+		m_data = NULL;
+	}
+}
+
+//	Mutable element access. The index is range-checked by CLASSERT only
+//	(both bounds, so a negative index is caught as well).
+template<class T>
+T& Array<T>::operator[](int idx)
+{
+	CLASSERT(idx>=0 && idx<m_size);
+	return m_data[idx];
+}
+
+//	Read-only element access. The index is range-checked by CLASSERT only
+//	(both bounds, so a negative index is caught as well).
+template<class T>
+const T& Array<T>::operator[](int idx) const
+{
+	CLASSERT(idx>=0 && idx<m_size);
+	return m_data[idx];
+}
+
+//	Appends a copy of elem. When the storage is full it grows by INCREASE_SIZE slots.
+//	elem may refer to an element of this array: it is copied into the new storage
+//	before the old storage is released.
+template<class T>
+void Array<T>::pushBack(const T& elem)
+{
+	if( m_size == m_capacity )
+	{
+		const int oldCap = m_capacity;
+		const int newCap = oldCap + INCREASE_SIZE;
+		T* s = (T*)_aligned_malloc(sizeof(T)*newCap, 16);
+		memcpy( s, m_data, sizeof(T)*oldCap );
+		//	construct the fresh slots so the assignment below targets a live object (as setSize does)
+		for(int i=oldCap; i<newCap; i++) new(&s[i])T;
+
+		//	store first, free afterwards: elem could live inside the old buffer
+		s[ m_size ] = elem;
+
+		_aligned_free( m_data );
+		m_data = s;
+		m_capacity = newCap;
+		m_size++;
+		return;
+	}
+	m_data[ m_size++ ] = elem;
+}
+
+//	Drops the last element by decrementing the size; the slot itself is left untouched.
+template<class T>
+void Array<T>::popBack()
+{
+	CLASSERT( m_size>0 );
+	m_size--;
+}
+
+//	Resets the element count to zero; storage and capacity are kept.
+template<class T>
+void Array<T>::clear()
+{
+	m_size = 0;
+}
+
+//	Sets the element count to size. When size exceeds the capacity, new storage of exactly
+//	size slots is allocated, default-constructed, and the old slots are copied over with memcpy.
+template<class T>
+void Array<T>::setSize(int size)
+{
+	if( size <= m_capacity )
+	{
+		//	fits into the current storage
+		m_size = size;
+		return;
+	}
+
+	const int previousCapacity = m_capacity;
+	m_capacity = size;
+
+	T* grown = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&grown[slot])T;
+		slot++;
+	}
+	memcpy( grown, m_data, sizeof(T)*previousCapacity );
+
+	_aligned_free( m_data );
+	m_data = grown;
+	m_size = size;
+}
+
+//	Number of elements currently stored (not the capacity).
+template<class T>
+int Array<T>::getSize() const
+{
+	return m_size;
+}
+
+//	Read-only pointer to the first slot of the storage.
+template<class T>
+const T* Array<T>::begin() const
+{
+	return m_data;
+}
+
+//	Pointer to the first slot of the storage; invalidated whenever the storage is reallocated.
+template<class T>
+T* Array<T>::begin()
+{
+	return m_data;
+}
+
+//	Linear search using operator==; returns the index of the first match or -1 when absent.
+template<class T>
+int Array<T>::indexOf(const T& data) const
+{
+	int found = -1;
+	int i = 0;
+	while( i < m_size )
+	{
+		if( data == m_data[i] )
+		{
+			found = i;
+			break;
+		}
+		i++;
+	}
+	return found;
+}
+
+//	Removes element idx in O(1) by moving the last element into its slot.
+//	Element order is not preserved. idx is range-checked (both bounds) by CLASSERT only.
+template<class T>
+void Array<T>::removeAt(int idx)
+{
+	CLASSERT(idx>=0 && idx<m_size);
+	m_data[idx] = m_data[--m_size];
+}
+
+//	Grows the array by one element and hands back a reference to the new last slot.
+template<class T>
+T& Array<T>::expandOne()
+{
+	const int newIndex = m_size;
+	setSize( newIndex+1 );
+	return m_data[ newIndex ];
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollideUtils.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollideUtils.h
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef COLLIDE_UTILS_H
+#define COLLIDE_UTILS_H
+
+#include "Stubs/AdlMath.h"
+
+
+//	Stateless triangle helpers: point-vs-triangle classification and ray/segment casting.
+class CollideUtils
+{
+	public:
+		//	Tests p against triangle (a,b,c); on success writes the unit normal (w = signed plane distance of p)
+		//	to normalOut, negated when FLIPSIGN is true.
+		template<bool FLIPSIGN>
+		static bool collide(const float4& a, const float4& b, const float4& c, const float4& p, float4& normalOut, float margin = 0.f);
+
+		//	Casts rayFrom->rayTo against triangle (v0,v1,v2); returns the hit parameter t, or -1 when the
+		//	barycentric range tests fail. bCrdOut, when given, receives the barycentric coordinates.
+		__inline
+		static float castRay(const float4& v0, const float4& v1, const float4& v2,
+			 const float4& rayFrom, const float4& rayTo, float margin = 0.0f, float4* bCrdOut = NULL);
+
+};
+
+
+//	Classifies point p against triangle (a,b,c) by the signs of the three edge tests.
+//	a,b,c     : triangle vertices
+//	p         : query point
+//	normalOut : written only on success; unit triangle normal with w = dot(normal, p-a),
+//	            negated as a whole when FLIPSIGN is true
+//	margin    : tolerance applied to the three edge tests
+//	Returns true when all three edge values lie on the same side (within margin).
+template<bool FLIPSIGN>
+bool CollideUtils::collide(const float4& a, const float4& b, const float4& c, const float4& p, float4& normalOut, float margin)
+{
+	//	edges, walking a -> b -> c -> a
+	const float4 edgeAB = b-a;
+	const float4 edgeBC = c-b;
+	const float4 edgeCA = a-c;
+
+	//	vertex-to-point offsets
+	const float4 fromA = p-a;
+	const float4 fromB = p-b;
+	const float4 fromC = p-c;
+
+	const float4 faceN = cross3(edgeAB, -1.f*edgeCA);
+
+	const float sideAB = dot3F4(faceN, cross3( edgeAB, fromA ));
+	const float sideBC = dot3F4(faceN, cross3( edgeBC, fromB ));
+	const float sideCA = dot3F4(faceN, cross3( edgeCA, fromC ));
+
+	const bool allBelow = ( sideAB<margin && sideBC<margin && sideCA<margin );
+	const bool allAbove = ( sideAB>-margin && sideBC>-margin && sideCA>-margin );
+	if( !(allBelow || allAbove) )
+		return false;
+
+	float4 unitN = normalize3( faceN );
+	unitN.w = dot3F4(unitN,fromA);
+
+	normalOut = (FLIPSIGN)? -unitN : unitN;
+	return true;
+}
+
+//	Casts the segment rayFrom->rayTo against triangle (v0,v1,v2).
+//	margin  : tolerance added to the [0,1] range test of each barycentric coordinate
+//	bCrdOut : optional; receives the barycentric coordinates (u,v,w) in x,y,z on a hit
+//	Returns the parameter t of the hit along the segment, or -1 when the barycentric tests fail
+//	or the segment is parallel to the triangle plane. t itself is not range-checked, so callers
+//	that need a hit inside the segment must test 0 <= t <= 1 themselves.
+__inline
+float CollideUtils::castRay(const float4& v0, const float4& v1, const float4& v2,
+			 const float4& rayFrom, const float4& rayTo, float margin, float4* bCrdOut)
+{
+	float t, v, w;
+	float4 ab; ab = v1 - v0;
+	float4 ac; ac = v2 - v0;
+	float4 qp; qp = rayFrom - rayTo;
+	float4 normal = cross3( ab, ac );
+	float d = dot3F4( qp, normal );
+	//	d == 0: segment parallel to the plane (or degenerate triangle/segment). 1/d would be inf and
+	//	0*inf gives NaN, which slips through every range test below, so report a miss instead.
+	if( d == 0.f ) return -1;
+	float odd = 1.f/d;
+	float4 ap; ap = rayFrom - v0;
+	t = dot3F4( ap, normal );
+	t *= odd;
+//	if( t < 0.f || t > 1.f ) return -1;
+
+	float4 e = cross3( qp, ap );
+	v = dot3F4( ac, e );
+	v *= odd;
+	if( v < -margin || v > 1.f+margin ) return -1;
+	w = -dot3F4( ab, e );
+	w *= odd;
+//	if( w < 0.f || w > 1.f ) return -1;
+	if( w < -margin || w > 1.f+margin ) return -1;
+
+	float u = 1.f-v-w;
+	if( u < -margin || u > 1.f+margin ) return -1;
+	
+	if( bCrdOut )
+	{
+		bCrdOut->x = u;
+		bCrdOut->y = v;
+		bCrdOut->z = w;
+	}
+	return t;
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollisionShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollisionShape.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef COLLISION_SHAPE_H
+#define COLLISION_SHAPE_H
+
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlAabb.h"
+
+
+//	Abstract base of the collision shapes: carries a type tag, a bounding box and a collision margin.
+//	Derived classes implement the two distance queries.
+_MEM_CLASSALIGN16
+class CollisionShape
+{
+	public:
+		_MEM_ALIGNED_ALLOCATOR16;
+
+		//	MAX_NUM_SHAPE_TYPES is the enumerator count, not a shape
+		enum Type
+		{
+			SHAPE_HEIGHT_FIELD,
+			SHAPE_CONVEX_HEIGHT_FIELD,
+			SHAPE_PLANE,
+			MAX_NUM_SHAPE_TYPES,
+		};
+
+		//	m_aabb is not initialised here; presumably the derived class fills it -- confirm
+		CollisionShape( Type type, float collisionMargin = 0.0025f ) : m_type( type ){ m_collisionMargin = collisionMargin; }
+		virtual ~CollisionShape(){}
+		virtual float queryDistance(const float4& p) const = 0;
+		virtual bool queryDistanceWithNormal(const float4& p, float4& normalOut) const = 0;
+
+	public:
+		Type m_type;
+		Aabb m_aabb;
+		float m_collisionMargin;
+};
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlConstraint4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlConstraint4.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_CONSTRAINT4_H
+#define ADL_CONSTRAINT4_H
+
+
+
+//	Solver constraint record for a pair of bodies with room for four points and two friction entries.
+//	NOTE(review): this header uses float4/u32/_MEM_ALIGNED_ALLOCATOR16 without including the header
+//	that defines them, so it relies on the includer's include order.
+struct Constraint4
+		{
+			_MEM_ALIGNED_ALLOCATOR16;
+
+			//	w lane doubles as the friction coefficient (see set/getFrictionCoeff)
+			float4 m_linear;
+			float4 m_worldPos[4];
+			float4 m_center;	//	friction
+			float m_jacCoeffInv[4];
+			float m_b[4];
+			float m_appliedRambdaDt[4];
+
+			float m_fJacCoeffInv[2];	//	friction
+			float m_fAppliedRambdaDt[2];	//	friction
+
+			u32 m_bodyA;
+			u32 m_bodyB;
+
+			u32 m_batchIdx;
+			u32 m_paddings[1];
+
+			//	stores value in m_linear.w
+			__inline
+			void setFrictionCoeff(float value) { m_linear.w = value; }
+			__inline
+			float getFrictionCoeff() const { return m_linear.w; }
+		};
+
+#endif //ADL_CONSTRAINT4_H
+		
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlContact4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlContact4.h
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_CONTACT4_H
+#define ADL_CONTACT4_H
+
+#ifdef CL_PLATFORM_AMD
+#include "AdlConstraint4.h"
+#include "Adl/Adl.h"
+
+typedef adl::Buffer<Constraint4>* SolverData;
+#else
+typedef void* SolverData;
+#endif
+
+typedef void* ShapeDataType;
+
+
+//	Contact record between two bodies holding up to four world-space points.
+//	Packing conventions visible in the accessors:
+//	  - m_worldNormal.w stores the number of points (as a float)
+//	  - m_worldPos[i].w stores the penetration of point i
+//	  - restitution and friction are quantised from [0,1] to 16 bits
+struct Contact4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+
+	float4 m_worldPos[4];
+	float4 m_worldNormal;
+//	float m_restituitionCoeff;
+//	float m_frictionCoeff;
+	u16 m_restituitionCoeffCmp;
+	u16 m_frictionCoeffCmp;
+	int m_batchIdx;
+
+	u32 m_bodyAPtr;
+	u32 m_bodyBPtr;
+
+	//	todo. make it safer
+	int& getBatchIdx() { return m_batchIdx; }
+	//	coefficients are stored as c*0xffff and decoded by dividing by 0xffff
+	float getRestituitionCoeff() const { return ((float)m_restituitionCoeffCmp/(float)0xffff); }
+	void setRestituitionCoeff( float c ) { ADLASSERT( c >= 0.f && c <= 1.f ); m_restituitionCoeffCmp = (u16)(c*0xffff); }
+	float getFrictionCoeff() const { return ((float)m_frictionCoeffCmp/(float)0xffff); }
+	void setFrictionCoeff( float c ) { ADLASSERT( c >= 0.f && c <= 1.f ); m_frictionCoeffCmp = (u16)(c*0xffff); }
+
+	float& getNPoints() { return m_worldNormal.w; }
+	float getNPoints() const { return m_worldNormal.w; }
+
+	float getPenetration(int idx) const { return m_worldPos[idx].w; }
+
+	//	invalid == both body references are zero.
+	//	OR instead of +: an unsigned sum can wrap around to 0 for two non-zero values.
+	bool isInvalid() const { return (m_bodyAPtr|m_bodyBPtr) == 0; }
+};
+
+//	Host-side contact record with up to four world-space points and raw body pointers.
+//	m_worldPos[i].w stores the penetration of point i. The point count is kept in m_data.m_nPoints,
+//	which shares storage with m_worldNormal through the union (laid out to overlay its last lane).
+struct ContactPoint4
+		{
+			float4 m_worldPos[4];
+			union
+			{
+				float4 m_worldNormal;
+
+				struct Data
+				{
+					int m_padding[3];
+					float m_nPoints;	//	for cl
+				}m_data;
+
+			};
+			float m_restituitionCoeff;
+			float m_frictionCoeff;
+//			int m_nPoints;
+//			int m_padding0;
+
+			void* m_bodyAPtr;
+			void* m_bodyBPtr;
+//			int m_padding1;
+//			int m_padding2;
+
+			float& getNPoints() { return m_data.m_nPoints; }
+			float getNPoints() const { return m_data.m_nPoints; }
+
+			float getPenetration(int idx) const { return m_worldPos[idx].w; }
+
+//			__inline
+//			void load(int idx, const ContactPoint& src);
+//			__inline
+//			void store(int idx, ContactPoint& dst) const;
+
+			//	invalid == both body pointers are null.
+			//	Tested directly: casting the pointers to u32 truncates them on 64-bit builds.
+			bool isInvalid() const { return !m_bodyAPtr && !m_bodyBPtr; }
+
+		};
+
+
+#endif //ADL_CONTACT4_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlError.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlError.h
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef CL_ERROR_H
+#define CL_ERROR_H
+
+#ifdef DX11RENDER
+#include <windows.h>
+#endif
+
+//	Assertion macros. Debug builds break into the debugger when x is false.
+//	Release builds still evaluate x (side effects are kept) but ignore the result.
+//	NOTE(review): both forms expand to a bare if-statement, so using them as the body of an
+//	unbraced if/else at a call site is fragile.
+#ifdef _DEBUG
+	#include <assert.h>
+	#define CLASSERT(x) if(!(x)){__debugbreak(); }
+	#define ADLASSERT(x) if(!(x)){__debugbreak(); }
+#else
+	#define CLASSERT(x) if(x){}
+	#define ADLASSERT(x) if(x){}
+
+#endif
+
+
+
+
+//	Compile-time check: declares an array whose size is x, so a false (0) condition yields a
+//	zero-sized array declaration. Only active in debug builds; release builds expand to nothing.
+#ifdef _DEBUG
+	#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
+#else
+	#define COMPILE_TIME_ASSERT(x)
+#endif
+
+//	printf-style debug output.
+//	Debug builds: with DX11RENDER the message is formatted into a 256-byte buffer and sent to
+//	OutputDebugString (converted to wide characters when UNICODE is defined); otherwise it goes to stdout.
+//	Release builds: no-op.
+#ifdef _DEBUG
+	#include <stdarg.h>
+	#include <stdio.h>
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+		va_list arg;
+		va_start(arg, fmt);
+#ifdef DX11RENDER
+		char buf[256];
+		//	_TRUNCATE clips over-long messages; vsprintf_s would invoke the CRT
+		//	invalid-parameter handler when the formatted text does not fit
+		_vsnprintf_s( buf, sizeof(buf), _TRUNCATE, fmt, arg );
+#ifdef UNICODE
+		WCHAR wbuf[256];
+		//	first call queries the required length, second call converts
+		int sizeWide = MultiByteToWideChar(0,0,buf,-1,wbuf,0);
+		MultiByteToWideChar(0,0,buf,-1,wbuf,sizeWide);
+
+//		swprintf_s( wbuf, 256, L"%s", buf );
+		OutputDebugString( wbuf );
+#else
+		OutputDebugString( buf );
+#endif
+#else
+		vprintf(fmt, arg);
+#endif
+		va_end(arg);
+	}
+#else
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+	}
+#endif
+
+
+//	Prints "WARNING: <msg>" through debugPrintf. The expansion already ends with a semicolon.
+#define WARN(msg) debugPrintf("WARNING: %s\n", msg);
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMath.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMath.h
@@ -0,0 +1,216 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef CL_MATH_H
+#define CL_MATH_H
+
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <xmmintrin.h>
+
+
+#include "AdlError.h"
+#include <algorithm>
+#define pxSort std::sort
+
+//	Single-precision pi.
+#define PI       3.14159265358979323846f
+//	Rounds num up to the next multiple of alignment (integer arithmetic).
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+
+//	MSVC-specific 16-byte alignment attribute for type declarations.
+#define _MEM_CLASSALIGN16 __declspec(align(16))
+//	Class-scope operator new/delete overloads (scalar, array and placement forms) that route
+//	heap allocations through _aligned_malloc/_aligned_free with 16-byte alignment.
+#define _MEM_ALIGNED_ALLOCATOR16 	void* operator new(size_t size) { return _aligned_malloc( size, 16 ); } \
+	void operator delete(void *p) { _aligned_free( p ); } \
+	void* operator new[](size_t size) { return _aligned_malloc( size, 16 ); } \
+	void operator delete[](void *p) { _aligned_free( p ); } \
+	void* operator new(size_t size, void* p) { return p; } \
+	void operator delete(void *p, void* pp) {} 
+
+
+
+//	Rounds n up to the nearest power of two; powers of two are returned unchanged.
+//	n-1 is smeared to the right with doubling shifts (1,2,4,...) so every bit below the highest
+//	set bit becomes 1, then 1 is added back.
+//	Note: for unsigned T, n == 0 wraps around and yields 0.
+template<class T>
+T nextPowerOf2(T n)
+{
+	const size_t numBits = sizeof(T)*8;
+
+	n -= 1;
+	for(size_t shift=1; shift<numBits; shift<<=1)
+		n = n | (n>>shift);
+	return n+1;
+}
+
+
+//	Four packed floats, 16-byte aligned. The union exposes the same storage three ways:
+//	named lanes (x,y,z,w), an indexable array s[4], and an SSE register value m_quad.
+_MEM_CLASSALIGN16
+struct float4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	union
+	{
+		struct
+		{
+			float x,y,z,w;
+		};
+		struct
+		{
+			float s[4];
+		};
+		__m128 m_quad;
+	};
+};
+
+//	Returns 1 when all four lanes of a compare equal to zero, otherwise 0.
+__forceinline
+unsigned int isZero(const float4& a)
+{
+	const bool xyzZero = (a.x == 0.f) && (a.y == 0.f) && (a.z == 0.f);
+	const bool wZero = (a.w == 0.f);
+	return (xyzZero && wZero) ? 1u : 0u;
+}
+
+//	Four packed ints, 16-byte aligned; accessible as x,y,z,w or as s[4].
+_MEM_CLASSALIGN16
+struct int4
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	union
+	{
+		struct
+		{
+			int x,y,z,w;
+		};
+		struct
+		{
+			int s[4];
+		};
+	};
+};
+
+//	Two packed ints; accessible as x,y or as s[2]. No alignment attribute, unlike int4.
+struct int2
+{
+	union
+	{
+		struct
+		{
+			int x,y;
+		};
+		struct
+		{
+			int s[2];
+		};
+	};
+};
+
+//	Two packed floats; accessible as x,y or as s[2]. No alignment attribute, unlike float4.
+struct float2
+{
+	union
+	{
+		struct
+		{
+			float x,y;
+		};
+		struct
+		{
+			float s[2];
+		};
+	};
+};
+
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+
+
+#include "Adlfloat4.inl"
+//#include <Common/Math/float4SSE.inl>
+
+
+
+
+//	Exchanges the values of a and b through one temporary copy.
+template<typename T>
+void swap2(T& a, T& b)
+{
+	T saved(a);
+	a = b;
+	b = saved;
+}
+
+
+//	Seeds the C runtime random number generator used by randRange().
+__inline
+void randSeed(int seed)
+{
+	const unsigned int crtSeed = (unsigned int)seed;
+	srand( crtSeed );
+}
+
+//	Pseudo-random value in [minV, maxV): draws one rand() value, reduces it to a fraction with
+//	a resolution of 1/10000, and interpolates between the two bounds.
+template<typename T>
+__inline
+T randRange(const T& minV, const T& maxV)
+{
+	const int ticks = rand()%10000;
+	const float fraction = ticks/10000.f;
+	const T span = maxV - minV;
+	return (T)(minV + fraction*span);
+}
+
+//	float4 specialisation: draws an independent fraction (resolution 1/10000) for each of the four
+//	lanes and interpolates component-wise between minV and maxV.
+//	Note: the four rand() calls are function arguments, so the order in which they are evaluated
+//	(and therefore which lane gets which draw) is left to the compiler.
+template<>
+__inline
+float4 randRange(const float4& minV, const float4& maxV)
+{
+	float4 r = make_float4( (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f, (rand()%10000)/10000.f );
+	float4 range = maxV - minV;
+	return (minV + r*range);
+}
+
+
+//	Key/value pair for sorting. The 32-bit key can also be addressed as two 16-bit halves
+//	through the union. Ordering (operator<) compares the 32-bit key only.
+struct SortData
+{
+	union
+	{
+		u32 m_key;
+		struct { u16 m_key16[2]; };
+	};
+	u32 m_value;
+
+	friend bool operator <(const SortData& a, const SortData& b)
+	{
+		return a.m_key < b.m_key;
+	}
+};
+
+
+
+//	Returns baseAddr advanced by offset bytes, reinterpreted as T*.
+//	The arithmetic is done on a char pointer so the full pointer width is preserved
+//	(a cast through u32 would drop the upper half of a 64-bit address).
+template<typename T>
+T* addByteOffset(void* baseAddr, u32 offset)
+{
+	return (T*)(((char*)baseAddr)+offset);
+}
+
+
+//	Pair of 32-bit unsigned values. The default constructor leaves both members uninitialised.
+struct Pair32
+{
+	Pair32(){}
+	Pair32(u32 a, u32 b) : m_a(a), m_b(b){}
+
+	u32 m_a;
+	u32 m_b;
+};
+
+//	Pair of untyped pointers. The default constructor leaves both members uninitialised;
+//	the template constructor accepts two pointers of the same type T and stores them as void*.
+struct PtrPair
+{
+	PtrPair(){}
+	PtrPair(void* a, void* b) : m_a(a), m_b(b){}
+	template<typename T>
+	PtrPair(T* a, T* b) : m_a((void*)a), m_b((void*)b){}
+
+	void* m_a;
+	void* m_b;
+};
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMatrix3x3.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMatrix3x3.h
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef MATRIX3X3_H
+#define MATRIX3X3_H
+
+#include "AdlMath.h"
+
+///////////////////////////////////////
+//	Matrix3x3
+///////////////////////////////////////
+
+//	3x3 matrix stored as three float4 rows, 16-byte aligned; the helpers below use only x,y,z of each row.
+typedef 
+_MEM_CLASSALIGN16 struct
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	float4 m_row[3];
+}Matrix3x3;
+
+__inline
+Matrix3x3 mtZero();
+
+__inline
+Matrix3x3 mtIdentity();
+
+__inline
+Matrix3x3 mtDiagonal(float a, float b, float c);
+
+__inline
+Matrix3x3 mtTranspose(const Matrix3x3& m);
+
+__inline
+Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b);
+
+__inline
+float4 mtMul1(const Matrix3x3& a, const float4& b);
+
+__inline
+Matrix3x3 mtMul2(float a, const Matrix3x3& b);
+
+__inline
+float4 mtMul3(const float4& b, const Matrix3x3& a);
+
+__inline
+Matrix3x3 mtInvert(const Matrix3x3& m);
+
+//	Returns the matrix with every row set to make_float4(0.f).
+__inline
+Matrix3x3 mtZero()
+{
+	Matrix3x3 zero;
+	for(int row=0; row<3; row++)
+	{
+		zero.m_row[row] = make_float4(0.f);
+	}
+	return zero;
+}
+
+//	Returns the identity matrix (ones on the diagonal, zeros elsewhere).
+__inline
+Matrix3x3 mtIdentity()
+{
+	Matrix3x3 identity;
+	float4* rows = identity.m_row;
+
+	rows[0] = make_float4(1,0,0);
+	rows[1] = make_float4(0,1,0);
+	rows[2] = make_float4(0,0,1);
+
+	return identity;
+}
+
+//	Returns the diagonal matrix diag(a, b, c).
+__inline
+Matrix3x3 mtDiagonal(float a, float b, float c)
+{
+	Matrix3x3 diag;
+	float4* rows = diag.m_row;
+
+	rows[0] = make_float4(a,0,0);
+	rows[1] = make_float4(0,b,0);
+	rows[2] = make_float4(0,0,c);
+
+	return diag;
+}
+
+//	Returns the transpose of m: row i of the result is column i of m, with w set to 0.
+__inline
+Matrix3x3 mtTranspose(const Matrix3x3& m)
+{
+	Matrix3x3 transposed;
+	for(int col=0; col<3; col++)
+	{
+		transposed.m_row[col] = make_float4(m.m_row[0].s[col], m.m_row[1].s[col], m.m_row[2].s[col], 0.f);
+	}
+	return transposed;
+}
+
+//	Matrix product a*b. b is transposed first so every output entry is a row-by-row dot product.
+//	The w lane of every result row is set to 0, consistent with mtTranspose and mtInvert.
+__inline
+Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b)
+{
+	Matrix3x3 transB;
+	transB = mtTranspose( b );
+	Matrix3x3 ans;
+	for(int i=0; i<3; i++)
+	{
+		ans.m_row[i].s[0] = dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].s[1] = dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].s[2] = dot3F4(a.m_row[i],transB.m_row[2]);
+		//	do not hand back an indeterminate fourth lane
+		ans.m_row[i].s[3] = 0.f;
+	}
+	return ans;
+}
+
+//	Matrix times column vector: result lane i is dot(row i of a, b).
+//	The w lane of the result is set to 0 instead of being left indeterminate.
+__inline
+float4 mtMul1(const Matrix3x3& a, const float4& b)
+{
+	float4 ans;
+	ans.s[0] = dot3F4( a.m_row[0], b );
+	ans.s[1] = dot3F4( a.m_row[1], b );
+	ans.s[2] = dot3F4( a.m_row[2], b );
+	ans.s[3] = 0.f;
+	return ans;
+}
+
+//	Scalar times matrix: every row of b is scaled by a.
+__inline
+Matrix3x3 mtMul2(float a, const Matrix3x3& b)
+{
+	Matrix3x3 scaled;
+	for(int row=0; row<3; row++)
+	{
+		scaled.m_row[row] = a*b.m_row[row];
+	}
+	return scaled;
+}
+
+//	Row vector times matrix: result lane j is the dot product of a with column j of b.
+//	The w lane of the result is set to 0 instead of being left indeterminate.
+__inline
+float4 mtMul3(const float4& a, const Matrix3x3& b)
+{
+	float4 ans;
+	ans.x = a.x*b.m_row[0].x + a.y*b.m_row[1].x + a.z*b.m_row[2].x;
+	ans.y = a.x*b.m_row[0].y + a.y*b.m_row[1].y + a.z*b.m_row[2].y;
+	ans.z = a.x*b.m_row[0].z + a.y*b.m_row[1].z + a.z*b.m_row[2].z;
+	ans.w = 0.f;
+	return ans;
+}
+
+//	Inverse of m via the adjugate: cofactors divided by the determinant.
+//	Only an exactly zero determinant trips the assertion; nearly singular input is not detected.
+__inline
+Matrix3x3 mtInvert(const Matrix3x3& m)
+{
+	//	determinant by the rule of Sarrus
+	float det = m.m_row[0].s[0]*m.m_row[1].s[1]*m.m_row[2].s[2]+m.m_row[1].s[0]*m.m_row[2].s[1]*m.m_row[0].s[2]+m.m_row[2].s[0]*m.m_row[0].s[1]*m.m_row[1].s[2]
+	-m.m_row[0].s[0]*m.m_row[2].s[1]*m.m_row[1].s[2]-m.m_row[2].s[0]*m.m_row[1].s[1]*m.m_row[0].s[2]-m.m_row[1].s[0]*m.m_row[0].s[1]*m.m_row[2].s[2];
+
+	CLASSERT( det );
+
+	//	adjugate (transposed cofactor matrix), one row at a time; w lanes are cleared
+	Matrix3x3 ans;
+	ans.m_row[0].s[0] = m.m_row[1].s[1]*m.m_row[2].s[2] - m.m_row[1].s[2]*m.m_row[2].s[1];
+	ans.m_row[0].s[1] = m.m_row[0].s[2]*m.m_row[2].s[1] - m.m_row[0].s[1]*m.m_row[2].s[2];
+	ans.m_row[0].s[2] = m.m_row[0].s[1]*m.m_row[1].s[2] - m.m_row[0].s[2]*m.m_row[1].s[1];
+	ans.m_row[0].w = 0.f;
+
+	ans.m_row[1].s[0] = m.m_row[1].s[2]*m.m_row[2].s[0] - m.m_row[1].s[0]*m.m_row[2].s[2];
+	ans.m_row[1].s[1] = m.m_row[0].s[0]*m.m_row[2].s[2] - m.m_row[0].s[2]*m.m_row[2].s[0];
+	ans.m_row[1].s[2] = m.m_row[0].s[2]*m.m_row[1].s[0] - m.m_row[0].s[0]*m.m_row[1].s[2];
+	ans.m_row[1].w = 0.f;
+
+	ans.m_row[2].s[0] = m.m_row[1].s[0]*m.m_row[2].s[1] - m.m_row[1].s[1]*m.m_row[2].s[0];
+	ans.m_row[2].s[1] = m.m_row[0].s[1]*m.m_row[2].s[0] - m.m_row[0].s[0]*m.m_row[2].s[1];
+	ans.m_row[2].s[2] = m.m_row[0].s[0]*m.m_row[1].s[1] - m.m_row[0].s[1]*m.m_row[1].s[0];
+	ans.m_row[2].w = 0.f;
+
+	//	scale the adjugate by 1/det
+	ans = mtMul2((1.0f/det), ans);
+	return ans;
+}
+
+//	Builds a matrix whose rows are a, b and c (all four lanes are copied).
+__inline
+Matrix3x3 mtSet( const float4& a, const float4& b, const float4& c )
+{
+	const float4* source[3] = { &a, &b, &c };
+
+	Matrix3x3 built;
+	for(int row=0; row<3; row++)
+	{
+		built.m_row[row] = *source[row];
+	}
+	return built;
+}
+
+//	Row-wise matrix sum.
+__inline
+Matrix3x3 operator+(const Matrix3x3& a, const Matrix3x3& b)
+{
+	Matrix3x3 sum;
+	for(int row=0; row<3; row++)
+	{
+		sum.m_row[row] = a.m_row[row] + b.m_row[row];
+	}
+	return sum;
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlQuaternion.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlQuaternion.h
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef QUATERNION_H
+#define QUATERNION_H
+
+#include "AdlMatrix3x3.h"
+
+
+//	A quaternion is stored in a float4: lanes 0..2 hold the vector part (x, y, z), lane 3 holds w.
+typedef float4 Quaternion;
+
+//	Quaternion for a rotation of `angle` about `axis` (the axis is normalized internally).
+__inline
+Quaternion qtSet(const float4& axis, float angle);
+
+//	Quaternion product a*b.
+__inline
+Quaternion qtMul(const Quaternion& a, const Quaternion& b);
+
+//	Rotates vec by q, evaluated as q * (vec, 0) * qtInvert(q).
+__inline
+float4 qtRotate(const Quaternion& q, const float4& vec);
+
+//	Rotates vec by qtInvert(q).
+__inline
+float4 qtInvRotate(const Quaternion& q, const float4& vec);
+
+//	Conjugate of q (vector part negated, w kept).
+__inline
+Quaternion qtInvert(const Quaternion& q);
+
+//	3x3 rotation matrix built from quat.
+__inline
+Matrix3x3 qtGetRotationMatrix(const Quaternion& quat);
+
+//	q scaled to unit 4-component length.
+__inline
+Quaternion qtNormalize(const Quaternion& q);
+
+//	Identity rotation: zero vector part and w = 1.
+__inline
+Quaternion qtGetIdentity()
+{
+	return make_float4(0, 0, 0, 1);
+}
+
+//	Builds the quaternion (n*sin(angle/2), cos(angle/2)) where n is `axis` normalized with normalize3.
+//	`angle` is passed straight to sin/cos, i.e. it is in radians.
+__inline
+Quaternion qtSet(const float4& axis, float angle)
+{
+	const float4 unitAxis = normalize3( axis );
+	const float halfAngle = angle/2;
+
+	Quaternion result;
+	result.s[0] = unitAxis.s[0]*sin(halfAngle);
+	result.s[1] = unitAxis.s[1]*sin(halfAngle);
+	result.s[2] = unitAxis.s[2]*sin(halfAngle);
+	result.s[3] = cos(halfAngle);
+	return result;
+}
+
+//	Quaternion product a*b:
+//	vector part = cross(a,b) + a.w*b + b.w*a, scalar part = a.w*b.w - dot3(a,b).
+__inline
+Quaternion qtMul(const Quaternion& a, const Quaternion& b)
+{
+	Quaternion product = cross3( a, b );
+	product += a.s[3]*b + b.s[3]*a;
+
+	//	lane 3 currently holds a by-product of the float4 arithmetic above; overwrite it with the real scalar part
+	const float vecDot = a.s[0]*b.s[0]+a.s[1]*b.s[1]+a.s[2]*b.s[2];
+	product.s[3] = a.s[3]*b.s[3] - vecDot;
+	return product;
+}
+
+//	Rotates vec by q: the vector is promoted to a quaternion with w = 0 and
+//	sandwiched as q * v * qtInvert(q). The incoming vec.w is ignored.
+__inline
+float4 qtRotate(const Quaternion& q, const float4& vec)
+{
+	Quaternion pure = vec;
+	pure.s[3] = 0.f;
+
+	const Quaternion conj = qtInvert( q );
+	return qtMul( qtMul( q, pure ), conj );
+}
+
+//	Applies the rotation of qtInvert(q) to vec.
+__inline
+float4 qtInvRotate(const Quaternion& q, const float4& vec)
+{
+	const Quaternion conj = qtInvert( q );
+	return qtRotate( conj, vec );
+}
+
+//	Returns the conjugate of q: (x, y, z) negated, w unchanged.
+//	No division by the squared norm is performed.
+__inline
+Quaternion qtInvert(const Quaternion& q)
+{
+	return make_float4( -q.s[0], -q.s[1], -q.s[2], q.s[3] );
+}
+
+//	Expands quat = (x, y, z, w) into a 3x3 matrix using the standard quaternion-to-matrix formula.
+//	quat is used as given (no normalization here); the w lane of every row is set to 0.
+__inline
+Matrix3x3 qtGetRotationMatrix(const Quaternion& quat)
+{
+	const float x = quat.s[0];
+	const float y = quat.s[1];
+	const float z = quat.s[2];
+	const float w = quat.s[3];
+
+	//	squared vector components
+	const float xx = x*x;
+	const float yy = y*y;
+	const float zz = z*z;
+
+	Matrix3x3 rot;
+	rot.m_row[0] = make_float4( 1-2*yy-2*zz, 2*x*y-2*w*z, 2*x*z+2*w*y, 0.f );
+	rot.m_row[1] = make_float4( 2*x*y+2*w*z, 1-2*xx-2*zz, 2*y*z-2*w*x, 0.f );
+	rot.m_row[2] = make_float4( 2*x*z-2*w*y, 2*y*z+2*w*x, 1-2*xx-2*yy, 0.f );
+	return rot;
+}
+
+//	Converts the matrix *m (only m[0] is read) to a quaternion (x, y, z, w), using the same
+//	element convention as qtGetRotationMatrix(). The matrix is expected to be a pure rotation.
+//
+//	When the trace is positive, w is solved first (the original formula). Otherwise w is
+//	close to zero and dividing by 4*w would lose precision or divide by zero (e.g. for a
+//	180 degree rotation), so the largest diagonal element picks which of x, y, z is solved
+//	first. The result is sign-normalized to w >= 0, matching what the w-first formula yields.
+__inline
+Quaternion qtGetQuaternion(const Matrix3x3* m)
+{
+	const float4& r0 = m[0].m_row[0];
+	const float4& r1 = m[0].m_row[1];
+	const float4& r2 = m[0].m_row[2];
+	const float trace = r0.x + r1.y + r2.z;
+
+	Quaternion q;
+	if( trace > 0.f )
+	{
+		//	w is the dominant component
+		q.w = sqrtf( trace + 1 ) * 0.5f;
+		const float inv4w = 1.f/(4.f*q.w);
+		q.x = (r2.y-r1.z)*inv4w;
+		q.y = (r0.z-r2.x)*inv4w;
+		q.z = (r1.x-r0.y)*inv4w;
+	}
+	else if( r0.x >= r1.y && r0.x >= r2.z )
+	{
+		//	x is the dominant component: 1 + m00 - m11 - m22 == 4*x*x
+		q.x = sqrtf( 1.f + r0.x - r1.y - r2.z ) * 0.5f;
+		const float inv4x = 1.f/(4.f*q.x);
+		q.y = (r0.y+r1.x)*inv4x;
+		q.z = (r0.z+r2.x)*inv4x;
+		q.w = (r2.y-r1.z)*inv4x;
+	}
+	else if( r1.y >= r2.z )
+	{
+		//	y is the dominant component: 1 + m11 - m00 - m22 == 4*y*y
+		q.y = sqrtf( 1.f + r1.y - r0.x - r2.z ) * 0.5f;
+		const float inv4y = 1.f/(4.f*q.y);
+		q.x = (r0.y+r1.x)*inv4y;
+		q.z = (r1.z+r2.y)*inv4y;
+		q.w = (r0.z-r2.x)*inv4y;
+	}
+	else
+	{
+		//	z is the dominant component: 1 + m22 - m00 - m11 == 4*z*z
+		q.z = sqrtf( 1.f + r2.z - r0.x - r1.y ) * 0.5f;
+		const float inv4z = 1.f/(4.f*q.z);
+		q.x = (r0.z+r2.x)*inv4z;
+		q.y = (r1.z+r2.y)*inv4z;
+		q.w = (r1.x-r0.y)*inv4z;
+	}
+
+	//	q and -q encode the same rotation; keep w non-negative as the w-first path does
+	if( q.w < 0.f )
+	{
+		q.x = -q.x;
+		q.y = -q.y;
+		q.z = -q.z;
+		q.w = -q.w;
+	}
+
+	return q;
+}
+
+//	Scales q to unit length over all four components (delegates to normalize4).
+__inline
+Quaternion qtNormalize(const Quaternion& q)
+{
+	const Quaternion unit = normalize4(q);
+	return unit;
+}
+
+//	Rotates p by `orientation`, then adds `translation` (float4 addition, so all four lanes are summed).
+__inline
+float4 transform(const float4& p, const float4& translation, const Quaternion& orientation)
+{
+	const float4 rotated = qtRotate( orientation, p );
+	return rotated + translation;
+}
+
+//	Inverse of transform(): subtracts `translation` from p, then rotates by qtInvert(orientation).
+__inline
+float4 invTransform(const float4& p, const float4& translation, const Quaternion& orientation)
+{
+	const float4 local = p-translation;
+	return qtInvRotate( orientation, local );
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlRigidBody.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlRigidBody.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_RIGID_BODY_H
+#define ADL_RIGID_BODY_H
+
+#include "AdlQuaternion.h"
+
+//	Container for the plain-data records that describe a rigid body.
+//	It declares no members or methods of its own, only the nested Body and Inertia structs.
+class RigidBodyBase
+{
+	public:
+
+		//	Per-body state. The _MEM_* macros are defined elsewhere; judging by their names they
+		//	request 16-byte alignment for the struct and its allocations -- TODO confirm.
+		_MEM_CLASSALIGN16
+		struct Body
+		{
+			_MEM_ALIGNED_ALLOCATOR16;
+
+			float4 m_pos;		//	position
+			Quaternion m_quat;	//	orientation
+			float4 m_linVel;	//	linear velocity
+			float4 m_angVel;	//	angular velocity
+
+			u32 m_shapeIdx;		//	presumably an index into a shape buffer -- verify against users
+			u32 m_shapeType;
+
+			float m_invMass;	//	inverse mass
+			float m_restituitionCoeff;	//	restitution coefficient (the misspelling is part of the field name)
+			float m_frictionCoeff;
+			
+		};
+
+		//	Inverse inertia tensors kept separately from Body.
+		struct Inertia
+		{
+//	The fields below are disabled; equivalents of most of them live in Body.
+/*			u16 m_shapeType;
+			u16 m_shapeIdx;
+			float m_restituitionCoeff;
+			float m_frictionCoeff;
+			int m_padding;
+*/
+			Matrix3x3 m_invInertia;		//	presumably the current (world space) inverse inertia -- TODO confirm
+			Matrix3x3 m_initInvInertia;	//	presumably the initial (body space) inverse inertia -- TODO confirm
+		};
+};
+
+#endif// ADL_RIGID_BODY_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlTransform.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlTransform.h
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef _ADL_TRANSFORM_H
+#define _ADL_TRANSFORM_H
+
+#include "AdlMath.h"
+#include "AdlQuaternion.h"
+#include "AdlMatrix3x3.h"
+
+//	Rigid transform stored as a translation vector plus a 3x3 rotation matrix.
+//	trMul1() applies it to a point as m_rotation * p + m_translation.
+struct Transform
+{
+	float4 m_translation;
+	Matrix3x3 m_rotation;
+};
+
+//	Builds a Transform from a translation and an orientation quaternion; the quaternion is
+//	converted to a matrix with qtGetRotationMatrix().
+//	Declared __inline like the other helpers defined in these headers, so that including this
+//	header from several translation units does not produce duplicate-symbol link errors.
+__inline
+Transform trSetTransform(const float4& translation, const Quaternion& quat)
+{
+	Transform tr;
+	tr.m_translation = translation;
+	tr.m_rotation = qtGetRotationMatrix( quat );
+	return tr;
+}
+
+//	Inverts a rigid transform: the rotation is transposed (valid as an inverse only when it is
+//	orthonormal) and the translation becomes mtMul1( transposed rotation, -translation ).
+//	Declared __inline like the other helpers defined in these headers, so that including this
+//	header from several translation units does not produce duplicate-symbol link errors.
+__inline
+Transform trInvert( const Transform& tr )
+{
+	Transform ans;
+	ans.m_rotation = mtTranspose( tr.m_rotation );
+	ans.m_translation = mtMul1( ans.m_rotation, -tr.m_translation );
+	return ans;
+}
+
+//	Composes two transforms: rotation = mtMul( A.rot, B.rot ),
+//	translation = mtMul1( A.rot, B.translation ) + A.translation.
+//	With trMul1() semantics this is the transform that applies trB first, then trA.
+//	Declared __inline like the other helpers defined in these headers, so that including this
+//	header from several translation units does not produce duplicate-symbol link errors.
+__inline
+Transform trMul(const Transform& trA, const Transform& trB)
+{
+	Transform ans; 
+	ans.m_rotation = mtMul( trA.m_rotation, trB.m_rotation );
+	ans.m_translation = mtMul1( trA.m_rotation, trB.m_translation ) + trA.m_translation;
+	return ans;
+}
+
+//	Applies tr to the point p: mtMul1( rotation, p ) + translation.
+//	Declared __inline like the other helpers defined in these headers, so that including this
+//	header from several translation units does not produce duplicate-symbol link errors.
+__inline
+float4 trMul1(const Transform& tr, const float4& p)
+{
+	return mtMul1( tr.m_rotation, p ) + tr.m_translation;
+}
+
+
+#endif //_ADL_TRANSFORM_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4.inl
@@ -0,0 +1,373 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//	CHECK_ALIGNMENT(a): the disabled definition asserts that `a` sits on a 16-byte boundary;
+//	the active definition only evaluates `a` and discards it, so the check is a no-op.
+//#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
+#define CHECK_ALIGNMENT(a) a;
+
+
+//	Builds a float4 from its components; w defaults to 0.
+__inline
+float4 make_float4(float x, float y, float z, float w = 0.f)
+{
+	float4 result;
+	result.x = x;
+	result.y = y;
+	result.z = z;
+	result.w = w;
+	return result;
+}
+
+//	Broadcasts x into all four components.
+__inline
+float4 make_float4(float x)
+{
+	const float4 splat = make_float4(x, x, x, x);
+	return splat;
+}
+
+//	Converts each of the four integer components of x to float.
+__inline
+float4 make_float4(const int4& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	const float fz = (float)x.s[2];
+	const float fw = (float)x.s[3];
+	return make_float4(fx, fy, fz, fw);
+}
+
+//	Builds a float2 from its two components.
+__inline
+float2 make_float2(float x, float y)
+{
+	float2 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	return result;
+}
+
+//	Broadcasts x into both components.
+__inline
+float2 make_float2(float x)
+{
+	const float2 splat = make_float2(x, x);
+	return splat;
+}
+
+//	Converts both integer components of x to float.
+__inline
+float2 make_float2(const int2& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	return make_float2(fx, fy);
+}
+
+//	Builds an int4 from its components; w defaults to 0.
+__inline
+int4 make_int4(int x, int y, int z, int w = 0)
+{
+	int4 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	result.s[2] = z;
+	result.s[3] = w;
+	return result;
+}
+
+//	Broadcasts x into all four components.
+__inline
+int4 make_int4(int x)
+{
+	const int4 splat = make_int4(x, x, x, x);
+	return splat;
+}
+
+//	Converts each float component to int with a C cast (truncation toward zero).
+__inline
+int4 make_int4(const float4& x)
+{
+	const int ix = (int)x.x;
+	const int iy = (int)x.y;
+	const int iz = (int)x.z;
+	const int iw = (int)x.w;
+	return make_int4(ix, iy, iz, iw);
+}
+
+//	Builds an int2 from its two components.
+__inline
+int2 make_int2(int a, int b)
+{
+	int2 result;
+	result.x = a;
+	result.y = b;
+	return result;
+}
+
+//	Unary minus: negates all four components.
+__inline
+float4 operator-(const float4& a)
+{
+	float4 negated;
+	negated.x = -a.x;
+	negated.y = -a.y;
+	negated.z = -a.z;
+	negated.w = -a.w;
+	return negated;
+}
+
+//	Component-wise product of a and b (all four lanes).
+__inline
+float4 operator*(const float4& a, const float4& b)
+{
+	//	Use the file-wide alignment hook like every other operator here. The former
+	//	hard-coded CLASSERT cast the address to u32, which truncates pointers on 64-bit
+	//	targets, and ignored the fact that the check is disabled via CHECK_ALIGNMENT.
+	CHECK_ALIGNMENT(a);
+
+	float4 out;
+	out.s[0] = a.s[0]*b.s[0];
+	out.s[1] = a.s[1]*b.s[1];
+	out.s[2] = a.s[2]*b.s[2];
+	out.s[3] = a.s[3]*b.s[3];
+	return out;
+}
+
+//	Scales every component of b by the scalar a.
+__inline
+float4 operator*(float a, const float4& b)
+{
+	const float sx = a*b.s[0];
+	const float sy = a*b.s[1];
+	const float sz = a*b.s[2];
+	const float sw = a*b.s[3];
+	return make_float4(sx, sy, sz, sw);
+}
+
+//	Scales every component of b by the scalar a (vector on the left-hand side).
+__inline
+float4 operator*(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	const float sx = a*b.s[0];
+	const float sy = a*b.s[1];
+	const float sz = a*b.s[2];
+	const float sw = a*b.s[3];
+	return make_float4(sx, sy, sz, sw);
+}
+
+//	In-place component-wise multiply: a[i] *= b[i] for all four lanes.
+__inline
+void operator*=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] *= b.s[lane];
+	}
+}
+
+//	In-place scale: multiplies all four lanes of a by the scalar b.
+__inline
+void operator*=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] *= b;
+	}
+}
+
+//
+//	Component-wise quotient a[i]/b[i] over all four lanes; no check for zero divisors.
+__inline
+float4 operator/(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 quotient;
+	for(int lane=0; lane<4; lane++)
+	{
+		quotient.s[lane] = a.s[lane]/b.s[lane];
+	}
+	return quotient;
+}
+
+//	Divides every component of b by the scalar a (a true division per lane, not a multiply by 1/a).
+__inline
+float4 operator/(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	const float qx = b.s[0]/a;
+	const float qy = b.s[1]/a;
+	const float qz = b.s[2]/a;
+	const float qw = b.s[3]/a;
+	return make_float4(qx, qy, qz, qw);
+}
+
+//	In-place component-wise divide: a[i] /= b[i] for all four lanes.
+__inline
+void operator/=(float4& a, const float4& b)
+{
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] /= b.s[lane];
+	}
+}
+
+//	In-place divide of all four lanes of a by the scalar b.
+__inline
+void operator/=(float4& a, float b)
+{
+	//	Use the file-wide alignment hook like every other operator here. The former
+	//	hard-coded CLASSERT cast the address to u32, which truncates pointers on 64-bit
+	//	targets, and ignored the fact that the check is disabled via CHECK_ALIGNMENT.
+	CHECK_ALIGNMENT(a);
+
+	a.s[0]/=b;
+	a.s[1]/=b;
+	a.s[2]/=b;
+	a.s[3]/=b;
+}
+//
+
+//	Component-wise sum of a and b (all four lanes).
+__inline
+float4 operator+(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	for(int lane=0; lane<4; lane++)
+	{
+		sum.s[lane] = a.s[lane]+b.s[lane];
+	}
+	return sum;
+}
+
+//	Adds the scalar b to every component of a.
+__inline
+float4 operator+(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	for(int lane=0; lane<4; lane++)
+	{
+		sum.s[lane] = a.s[lane]+b;
+	}
+	return sum;
+}
+
+//	Component-wise difference a - b (all four lanes).
+__inline
+float4 operator-(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	for(int lane=0; lane<4; lane++)
+	{
+		diff.s[lane] = a.s[lane]-b.s[lane];
+	}
+	return diff;
+}
+
+//	Subtracts the scalar b from every component of a.
+__inline
+float4 operator-(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	for(int lane=0; lane<4; lane++)
+	{
+		diff.s[lane] = a.s[lane]-b;
+	}
+	return diff;
+}
+
+//	In-place component-wise add: a[i] += b[i] for all four lanes.
+__inline
+void operator+=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] += b.s[lane];
+	}
+}
+
+//	In-place add of the scalar b to all four lanes of a.
+__inline
+void operator+=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] += b;
+	}
+}
+
+//	In-place component-wise subtract: a[i] -= b[i] for all four lanes.
+__inline
+void operator-=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] -= b.s[lane];
+	}
+}
+
+//	In-place subtract of the scalar b from all four lanes of a.
+__inline
+void operator-=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int lane=0; lane<4; lane++)
+	{
+		a.s[lane] -= b;
+	}
+}
+
+
+
+
+
+//	3D cross product of the xyz parts of a and b; the w lane of the result is 0.
+__inline
+float4 cross3(const float4& a, const float4& b)
+{
+	const float cx = a.s[1]*b.s[2]-a.s[2]*b.s[1];
+	const float cy = a.s[2]*b.s[0]-a.s[0]*b.s[2];
+	const float cz = a.s[0]*b.s[1]-a.s[1]*b.s[0];
+	return make_float4(cx, cy, cz, 0);
+}
+
+//	Dot product of the xyz parts of a and b; the w lanes are ignored.
+__inline
+float dot3F4(const float4& a, const float4& b)
+{
+	const float xx = a.x*b.x;
+	const float yy = a.y*b.y;
+	const float zz = a.z*b.z;
+	return xx+yy+zz;
+}
+
+//	Euclidean length of the xyz part of a.
+__inline
+float length3(const float4& a)
+{
+	const float lengthSq = dot3F4(a,a);
+	return sqrtf(lengthSq);
+}
+
+//	Full four-component dot product of a and b.
+__inline
+float dot4(const float4& a, const float4& b)
+{
+	const float xx = a.x*b.x;
+	const float yy = a.y*b.y;
+	const float zz = a.z*b.z;
+	const float ww = a.w*b.w;
+	return xx+yy+zz+ww;
+}
+
+//	Evaluates the plane equation eqn = (nx, ny, nz, d) at `point`, treating point.w as 1:
+//	dot3(point, eqn) + eqn.w. The original note says this is used "for height".
+__inline
+float dot3w1(const float4& point, const float4& eqn)
+{
+	const float xx = point.x*eqn.x;
+	const float yy = point.y*eqn.y;
+	const float zz = point.z*eqn.z;
+	return xx+yy+zz+eqn.w;
+}
+
+//	Scales a by 1/length of its xyz part. The w lane is scaled by the same factor;
+//	a zero-length input is not guarded against.
+__inline
+float4 normalize3(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot3F4(a, a));
+	return invLength * a;
+}
+
+//	Scales a by 1/length computed over all four components; a zero-length input is not guarded against.
+__inline
+float4 normalize4(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot4(a, a));
+	return invLength * a;
+}
+
+//	Plane through the points a, b, c: xyz = normalize3( cross3(b-a, c-a) ), w = -dot3(normal, a),
+//	so that dot3w1(a, plane) evaluates to 0.
+__inline
+float4 createEquation(const float4& a, const float4& b, const float4& c)
+{
+	const float4 edgeAB = b-a;
+	const float4 edgeAC = c-a;
+
+	float4 plane = normalize3( cross3(edgeAB, edgeAC) );
+	plane.w = -dot3F4(plane,a);
+	return plane;
+}
+
+
+//	Returns a when a > b, otherwise b (so b is returned for equal values).
+template<typename T>
+__inline
+T max2(const T& a, const T& b)
+{
+	if( a>b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	Returns a when a < b, otherwise b (so b is returned for equal values).
+template<typename T>
+__inline
+T min2(const T& a, const T& b)
+{
+	if( a<b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	float4 specialization: per-component maximum over all four lanes.
+template<>
+__inline
+float4 max2(const float4& a, const float4& b)
+{
+	const float mx = max2(a.x,b.x);
+	const float my = max2(a.y,b.y);
+	const float mz = max2(a.z,b.z);
+	const float mw = max2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
+//	float4 specialization: per-component minimum over all four lanes.
+template<>
+__inline
+float4 min2(const float4& a, const float4& b)
+{
+	const float mx = min2(a.x,b.x);
+	const float my = min2(a.y,b.y);
+	const float mz = min2(a.z,b.z);
+	const float mw = min2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl
@@ -0,0 +1,381 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//	CHECK_ALIGNMENT(a): the disabled definition asserts that `a` sits on a 16-byte boundary;
+//	the active definition only evaluates `a` and discards it, so the check is a no-op.
+//#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
+#define CHECK_ALIGNMENT(a) a;
+
+
+//	Builds a float4 from its components; w defaults to 0.
+__inline
+float4 make_float4(float x, float y, float z, float w = 0.f)
+{
+	//	_mm_set_ps takes its arguments from the highest lane down to the lowest
+	float4 result;
+	result.m_quad = _mm_set_ps(w, z, y, x);
+	return result;
+}
+
+//	Broadcasts x into all four components.
+__inline
+float4 make_float4(float x)
+{
+	const float4 splat = make_float4(x, x, x, x);
+	return splat;
+}
+
+//	Converts each of the four integer components of x to float.
+__inline
+float4 make_float4(const int4& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	const float fz = (float)x.s[2];
+	const float fw = (float)x.s[3];
+	return make_float4(fx, fy, fz, fw);
+}
+
+//	Builds a float2 from its two components.
+__inline
+float2 make_float2(float x, float y)
+{
+	float2 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	return result;
+}
+
+//	Broadcasts x into both components.
+__inline
+float2 make_float2(float x)
+{
+	const float2 splat = make_float2(x, x);
+	return splat;
+}
+
+//	Converts both integer components of x to float.
+__inline
+float2 make_float2(const int2& x)
+{
+	const float fx = (float)x.s[0];
+	const float fy = (float)x.s[1];
+	return make_float2(fx, fy);
+}
+
+//	Builds an int4 from its components; w defaults to 0.
+__inline
+int4 make_int4(int x, int y, int z, int w = 0)
+{
+	int4 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	result.s[2] = z;
+	result.s[3] = w;
+	return result;
+}
+
+//	Broadcasts x into all four components.
+__inline
+int4 make_int4(int x)
+{
+	const int4 splat = make_int4(x, x, x, x);
+	return splat;
+}
+
+//	Converts each float component to int with a C cast (truncation toward zero).
+__inline
+int4 make_int4(const float4& x)
+{
+	const int ix = (int)x.x;
+	const int iy = (int)x.y;
+	const int iz = (int)x.z;
+	const int iw = (int)x.w;
+	return make_int4(ix, iy, iz, iw);
+}
+
+//	Builds an int2 from its two components.
+__inline
+int2 make_int2(int a, int b)
+{
+	int2 result;
+	result.x = a;
+	result.y = b;
+	return result;
+}
+
+//	Unary minus, computed as (0 - a) on all four lanes.
+__inline
+float4 operator-(const float4& a)
+{
+	float4 negated;
+	negated.m_quad = _mm_sub_ps( _mm_setzero_ps(), a.m_quad );
+	return negated;
+}
+
+//	Component-wise product of a and b (all four lanes) via _mm_mul_ps.
+__inline
+float4 operator*(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 product;
+	product.m_quad = _mm_mul_ps( a.m_quad, b.m_quad );
+	return product;
+}
+
+//	Scales every component of b by the scalar a: a is splatted to four lanes, then multiplied.
+__inline
+float4 operator*(float a, const float4& b)
+{
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( a );
+
+	const float4 scaled = scale*b;
+	return scaled;
+}
+
+//	Scales every component of b by the scalar a (vector on the left-hand side).
+__inline
+float4 operator*(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( a );
+
+	const float4 scaled = scale*b;
+	return scaled;
+}
+
+//	In-place component-wise multiply of a by b.
+__inline
+void operator*=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 product = a*b;
+	a = product;
+}
+
+//	In-place scale of all four lanes of a by the scalar b.
+__inline
+void operator*=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( b );
+
+	const float4 product = a*scale;
+	a = product;
+}
+
+//
+//	Component-wise quotient a/b over all four lanes via _mm_div_ps; no check for zero divisors.
+__inline
+float4 operator/(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 quotient;
+	quotient.m_quad = _mm_div_ps( a.m_quad, b.m_quad );
+	return quotient;
+}
+
+//	Divides every component of b by the scalar a: a is splatted to four lanes, then divided into b.
+__inline
+float4 operator/(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	float4 divisor;
+	divisor.m_quad = _mm_set1_ps( a );
+	return b/divisor;
+}
+
+//	In-place component-wise divide of a by b.
+__inline
+void operator/=(float4& a, const float4& b)
+{
+	const float4 quotient = a/b;
+	a = quotient;
+}
+
+//	In-place divide of all four lanes of a by the scalar b.
+__inline
+void operator/=(float4& a, float b)
+{
+	//	Use the file-wide alignment hook like every other operator here. The former
+	//	hard-coded CLASSERT cast the address to u32, which truncates pointers on 64-bit
+	//	targets, and ignored the fact that the check is disabled via CHECK_ALIGNMENT.
+	CHECK_ALIGNMENT(a);
+
+	float4 bv; bv.m_quad = _mm_set1_ps( b );
+	a = a/bv;
+}
+//
+
+//	Component-wise sum of a and b (all four lanes) via _mm_add_ps.
+__inline
+float4 operator+(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	sum.m_quad = _mm_add_ps( a.m_quad, b.m_quad );
+	return sum;
+}
+
+//	Adds the scalar b to every component of a.
+__inline
+float4 operator+(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 addend;
+	addend.m_quad = _mm_set1_ps( b );
+
+	const float4 sum = a+addend;
+	return sum;
+}
+
+//	Component-wise difference a - b (all four lanes) via _mm_sub_ps.
+__inline
+float4 operator-(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	diff.m_quad = _mm_sub_ps( a.m_quad, b.m_quad );
+	return diff;
+}
+
+//	Subtracts the scalar b from every component of a.
+__inline
+float4 operator-(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 subtrahend;
+	subtrahend.m_quad = _mm_set1_ps( b );
+
+	const float4 diff = a-subtrahend;
+	return diff;
+}
+
+//	In-place component-wise add of b to a.
+__inline
+void operator+=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 sum = a + b;
+	a = sum;
+}
+
+//	In-place add of the scalar b to all four lanes of a.
+__inline
+void operator+=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 addend;
+	addend.m_quad = _mm_set1_ps( b );
+
+	const float4 sum = a + addend;
+	a = sum;
+}
+
+//	In-place component-wise subtract of b from a.
+__inline
+void operator-=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 diff = a - b;
+	a = diff;
+}
+
+//	In-place subtract of the scalar b from all four lanes of a.
+__inline
+void operator-=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 subtrahend;
+	subtrahend.m_quad = _mm_set1_ps( b );
+
+	const float4 diff = a - subtrahend;
+	a = diff;
+}
+
+
+
+
+
+//	3D cross product of the xyz parts of a and b using SSE shuffles (after xnamathvector.inl).
+//	The w lane of the result is cleared to 0 with a bit mask.
+__inline
+float4 cross3(const float4& a, const float4& b)
+{
+	union IntVec
+	{
+		unsigned int m_i[4];
+		__m128 m_v;
+	};
+
+	//	keeps lanes x, y, z and zeroes lane w
+	IntVec xyzMask = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
+	__m128 va = a.m_quad;
+	__m128 vb = b.m_quad;
+
+	//	lhs = (ay, az, ax, aw)
+	__m128 lhs = _mm_shuffle_ps(va,va,_MM_SHUFFLE(3,0,2,1));
+	//	rhs = (bz, bx, by, bw)
+	__m128 rhs = _mm_shuffle_ps(vb,vb,_MM_SHUFFLE(3,1,0,2));
+	//	first products: (ay*bz, az*bx, ax*by, aw*bw)
+	__m128 cross = _mm_mul_ps(lhs,rhs);
+	//	lhs = (az, ax, ay, aw)
+	lhs = _mm_shuffle_ps(lhs,lhs,_MM_SHUFFLE(3,0,2,1));
+	//	rhs = (by, bz, bx, bw)
+	rhs = _mm_shuffle_ps(rhs,rhs,_MM_SHUFFLE(3,1,0,2));
+	//	second products: (az*by, ax*bz, ay*bx, aw*bw)
+	lhs = _mm_mul_ps(lhs,rhs);
+	//	first minus second gives the cross product in xyz
+	cross = _mm_sub_ps(cross,lhs);
+
+	float4 result;
+	result.m_quad = _mm_and_ps(cross,xyzMask.m_v);
+	return result;
+}
+
+//	Dot product of the xyz parts of a and b, i.e. a.x*b.x+a.y*b.y+a.z*b.z, using SSE.
+//	The w lanes do not contribute to the result.
+__inline
+float dot3F4(const float4& a, const float4& b)
+{
+	__m128 va = a.m_quad;
+	__m128 vb = b.m_quad;
+
+	//	prod = (ax*bx, ay*by, az*bz, aw*bw)
+	__m128 prod = _mm_mul_ps(va,vb);
+	//	tmp = (prod.y, prod.z, prod.y, prod.z)
+	__m128 tmp = _mm_shuffle_ps(prod,prod,_MM_SHUFFLE(2,1,2,1));
+	//	lane 0 = prod.x + prod.y
+	prod = _mm_add_ss(prod,tmp);
+	//	tmp = prod.z in every lane
+	tmp = _mm_shuffle_ps(tmp,tmp,_MM_SHUFFLE(1,1,1,1));
+	//	lane 0 = (prod.x + prod.y) + prod.z
+	prod = _mm_add_ss(prod,tmp);
+
+	//	splat lane 0 and read it back through the scalar view
+	float4 result;
+	result.m_quad = _mm_shuffle_ps(prod,prod,_MM_SHUFFLE(0,0,0,0));
+	return result.x;
+}
+
+//	Euclidean length of the xyz part of a.
+__inline
+float length3(const float4& a)
+{
+	const float lengthSq = dot3F4(a,a);
+	return sqrtf(lengthSq);
+}
+
+//	Full four-component dot product of a and b (scalar code, no SSE).
+__inline
+float dot4(const float4& a, const float4& b)
+{
+	const float xx = a.x*b.x;
+	const float yy = a.y*b.y;
+	const float zz = a.z*b.z;
+	const float ww = a.w*b.w;
+	return xx+yy+zz+ww;
+}
+
+//	Evaluates the plane equation eqn = (nx, ny, nz, d) at `point`, treating point.w as 1:
+//	dot3(point, eqn) + eqn.w. The original note says this is used "for height".
+__inline
+float dot3w1(const float4& point, const float4& eqn)
+{
+	const float xx = point.x*eqn.x;
+	const float yy = point.y*eqn.y;
+	const float zz = point.z*eqn.z;
+	return xx+yy+zz+eqn.w;
+}
+
+//	Scales a by 1/length of its xyz part. The w lane is scaled by the same factor;
+//	a zero-length input is not guarded against.
+__inline
+float4 normalize3(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot3F4(a, a));
+	return invLength * a;
+}
+
+//	Scales a by 1/length computed over all four components; a zero-length input is not guarded against.
+__inline
+float4 normalize4(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot4(a, a));
+	return invLength * a;
+}
+
+//	Plane through the points a, b, c: xyz = normalize3( cross3(b-a, c-a) ), w = -dot3(normal, a),
+//	so that dot3w1(a, plane) evaluates to 0.
+__inline
+float4 createEquation(const float4& a, const float4& b, const float4& c)
+{
+	const float4 edgeAB = b-a;
+	const float4 edgeAC = c-a;
+
+	float4 plane = normalize3( cross3(edgeAB, edgeAC) );
+	plane.w = -dot3F4(plane,a);
+	return plane;
+}
+
+
+//	Returns a when a > b, otherwise b (so b is returned for equal values).
+template<typename T>
+__inline
+T max2(const T& a, const T& b)
+{
+	if( a>b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	Returns a when a < b, otherwise b (so b is returned for equal values).
+template<typename T>
+__inline
+T min2(const T& a, const T& b)
+{
+	if( a<b )
+	{
+		return a;
+	}
+	return b;
+}
+
+//	float4 specialization: per-component maximum over all four lanes.
+template<>
+__inline
+float4 max2(const float4& a, const float4& b)
+{
+	const float mx = max2(a.x,b.x);
+	const float my = max2(a.y,b.y);
+	const float mz = max2(a.z,b.z);
+	const float mw = max2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
+//	float4 specialization: per-component minimum over all four lanes.
+template<>
+__inline
+float4 min2(const float4& a, const float4& b)
+{
+	const float mx = min2(a.x,b.x);
+	const float my = min2(a.y,b.y);
+	const float mz = min2(a.z,b.z);
+	const float mw = min2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowPhase.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowPhase.h
@@ -0,0 +1,154 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+#include <Adl/Adl.h>
+//#include <Common/Base/SyncObjects.h>
+
+#include "AdlMath.h"
+#include "AdlContact4.h"
+#include "AdlRigidBody.h"
+
+#include "../ConvexHeightFieldShape.h"
+
+//#include "TypeDefinition.h"
+//#include "RigidBody.h"
+//#include "ConvexHeightFieldShape.h"
+
+namespace adl
+{
+class ShapeBase;
+
+//	Device-independent part of the narrowphase: holds only the Config struct that is
+//	passed to ChNarrowphase<TYPE>::execute() and culling().
+class ChNarrowphaseBase
+{
+	public:
+		//	Tuning parameters supplied by the caller.
+		struct Config
+		{
+			float m_collisionMargin;
+		};
+//	Disabled earlier ShapeData layout; the active definition lives in ChNarrowphase<TYPE>.
+/*
+		typedef struct
+		{
+			//	m_normal.w == height in u8
+			float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
+
+			float m_scale;
+			float m_padding0;
+			float m_padding1;
+			float m_padding2;
+		} ShapeData;
+*/
+};
+
+//	Narrowphase collision stage, specialized per device type. All entry points are static
+//	and operate on a Data object created by allocate(); the implementations are in
+//	ChNarrowphase.inl.
+template<DeviceType TYPE>
+class ChNarrowphase : public ChNarrowphaseBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		//	Per-instance state: the owning device, the three kernels and a counter buffer.
+		struct Data
+		{
+			const Device* m_device;
+			Kernel* m_supportCullingKernel;
+			Kernel* m_narrowphaseKernel;
+			Kernel* m_narrowphaseWithPlaneKernel;
+
+			Buffer<u32>* m_counterBuffer;
+		};
+
+		enum
+		{
+			N_TASKS = 4,
+			HEIGHT_RES = ConvexHeightField::HEIGHT_RES,
+		};
+
+		//	Per-shape data: HEIGHT_RES*HEIGHT_RES entries for each of 6 faces, plus a scale.
+		//	NOTE(review): the "4" suffix suggests four u8 heights packed per u32 -- confirm in the .inl.
+		struct ShapeData
+		{
+			float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];
+
+			float m_scale;
+			float m_padding0;
+			float m_padding1;
+			float m_padding2;
+		};
+
+		//	Constants handed to the kernels; m_paddings rounds the struct up to four 32-bit words.
+		struct ConstData
+		{
+			int m_nPairs;
+			float m_collisionMargin;
+			int m_capacity;
+			int m_paddings[1];
+		};
+		
+		//	Creates the Data object for `device`; release it with deallocate().
+		static
+		Data* allocate( const Device* device );
+
+		static
+		void deallocate( Data* data );
+//	Disabled variants that took Buffer<ShapeData>* instead of ShapeDataType.
+/*
+		static
+		Buffer<ShapeData>* allocateShapeBuffer( const Device* device, int capacity );
+
+		static
+		void deallocateShapeBuffer( Buffer<ShapeData>* shapeBuf );
+
+		static
+		void setShape( Buffer<ShapeData>* shapeBuf, ShapeBase* shape, int idx, float collisionMargin );
+*/
+		//	ShapeDataType is declared outside this header.
+		static
+		ShapeDataType allocateShapeBuffer( const Device* device, int capacity );
+
+		static
+		void deallocateShapeBuffer( ShapeDataType shapeBuf );
+
+		//	Stores `shape` at slot idx of shapeBuf.
+		static
+		void setShape( ShapeDataType shapeBuf, ShapeBase* shape, int idx, float collisionMargin = 0.f );
+		
+		//	Same as above for a ConvexHeightField shape.
+		static
+		void setShape( ShapeDataType shapeBuf, ConvexHeightField* cvxShape, int idx, float collisionMargin = 0.f );
+
+		// Run NarrowphaseKernel
+		//	Takes nPairs body pairs; contacts go to contactOut and nContacts is passed by reference.
+		//template<bool USE_OMP>
+		static
+		void execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg );
+
+		// Run NarrowphaseWithPlaneKernel
+		//	Same as above with additional vertex (vtxBuf) and index (idxBuf) buffers.
+		//template<bool USE_OMP>
+		static
+		void execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			const Buffer<float4>* vtxBuf, const Buffer<int4>* idxBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg );
+
+		// Run SupportCullingKernel
+		//	Returns an int; presumably the number of pairs written to pairsOut -- verify in the .inl.
+		//template<bool USE_OMP>
+		static
+		int culling( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf, const Buffer<int2>* pairsOut, const Config& cfg );
+};
+
+//#include <AdlPhysics/Narrowphase/ChNarrowphase.inl>
+//#include <AdlPhysics/Narrowphase/ChNarrowphaseHost.inl>
+
+#include "ChNarrowphase.inl"
+
+};
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphase.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphase.inl
@@ -0,0 +1,303 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\ChNarrowphaseKernels"
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\ChNarrowphaseKernels"
+#define KERNEL0 "SupportCullingKernel"
+#define KERNEL1 "NarrowphaseKernel"
+
+#include "ChNarrowphaseKernels.h"
+
+// Host-side helpers used by the ChNarrowphase<TYPE> member definitions.
+class ChNarrowphaseImp
+{
+public:
+	// Packs four bytes into one 32-bit word: x ends up in bits 0-7, y in
+	// bits 8-15, z in bits 16-23 and w in bits 24-31.
+	//
+	// Every operand is widened to u32 before it is shifted. Without the casts
+	// the u8 values are promoted to (signed) int, and `w<<24` with w >= 128
+	// shifts a one into the sign bit (assuming u8 is an 8-bit type and int is
+	// 32 bits wide - the typedefs are not visible in this file).
+	static
+	__inline
+	u32 u32Pack(u8 x, u8 y, u8 z, u8 w)
+	{
+		return ((u32)x) | ((u32)y<<8) | ((u32)z<<16) | ((u32)w<<24);
+	}
+
+};
+
+// Creates the narrowphase state for `device`: fetches the three kernels and
+// allocates the one-element counter buffer. Release with deallocate().
+template<DeviceType TYPE>
+typename ChNarrowphase<TYPE>::Data* ChNarrowphase<TYPE>::allocate( const Device* device )
+{
+	// Embedded kernel source, indexed by device type; all-null when the
+	// kernels are not compiled in as strings.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* kernelSource[] = { narrowphaseKernelsCL, 0 };
+#else
+	const char* kernelSource[] = { 0, 0 };
+#endif
+	const char* source = kernelSource[TYPE];
+
+	// Build options handed to every getKernel() call below.
+	char buildOptions[100];
+	sprintf( buildOptions, "-I .\\NarrowPhaseCL\\" );
+
+	Data* state = new Data;
+	state->m_device = device;
+	state->m_supportCullingKernel = device->getKernel( PATH, KERNEL0, buildOptions, source );
+	state->m_narrowphaseKernel = device->getKernel( PATH, KERNEL1, buildOptions, source );
+	state->m_narrowphaseWithPlaneKernel = device->getKernel( PATH, "NarrowphaseWithPlaneKernel", buildOptions, source );
+
+	// Single u32 used as the contact/pair counter by execute() and culling().
+	state->m_counterBuffer = new Buffer<u32>( device, 1 );
+
+	return state;
+}
+
+
+// Releases what allocate() created: the counter buffer and the Data object.
+// A null `data` is ignored, mirroring the behaviour of `delete`.
+// NOTE(review): the kernels fetched in allocate() are not released here;
+// presumably the device owns them - confirm against Device::getKernel.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::deallocate( Data* data )
+{
+	if( data == 0 ) return;
+
+	delete data->m_counterBuffer;
+
+	delete data;
+}
+
+// Allocates room for `capacity` ShapeData entries on `device` and hands the
+// buffer back as a ShapeDataType handle. The device must be of type TYPE.
+template<DeviceType TYPE>
+ShapeDataType ChNarrowphase<TYPE>::allocateShapeBuffer( const Device* device, int capacity )
+{
+	ADLASSERT( device->m_type == TYPE );
+
+	Buffer<ShapeData>* shapes = new Buffer<ShapeData>( device, capacity );
+	return shapes;
+}
+
+// Destroys a shape buffer created by allocateShapeBuffer(). The handle is
+// cast back to its concrete Buffer<ShapeData> type before deletion.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::deallocateShapeBuffer( ShapeDataType shapeBuf )
+{
+	delete (Buffer<ShapeData>*)shapeBuf;
+}
+
+// Uploads `shape` to element `idx` of the shape buffer.
+//
+// The shape is first converted to a temporary ConvexHeightField; packing and
+// upload are then delegated to the ConvexHeightField overload of setShape(),
+// so the two overloads can no longer drift apart. That overload expands the
+// height field's AABB by `collisionMargin`, writes one ShapeData and waits
+// for the device to finish the write.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::setShape( ShapeDataType shapeBuf, ShapeBase* shape, int idx, float collisionMargin )
+{
+	ConvexHeightField* cvxShape = new ConvexHeightField( shape );
+
+	// The argument is a ConvexHeightField*, which selects the sibling overload.
+	ChNarrowphase<TYPE>::setShape( shapeBuf, cvxShape, idx, collisionMargin );
+
+	delete cvxShape;
+}
+
+// Packs `cvxShape` into one ShapeData and writes it to element `idx` of the
+// shape buffer, blocking until the device has completed the write.
+// Side effect: cvxShape->m_aabb is expanded in place by `collisionMargin`.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::setShape( ShapeDataType shapeBuf, ConvexHeightField* cvxShape, int idx, float collisionMargin )
+{
+	Buffer<ShapeData>* shapes = (Buffer<ShapeData>*)shapeBuf;
+	cvxShape->m_aabb.expandBy( make_float4( collisionMargin ) );
+
+	// Number of per-shape samples; heights are stored four to a word.
+	const int nSamples = HEIGHT_RES*HEIGHT_RES*6;
+	const int nPacked = nSamples/4;
+
+	ShapeData packed;
+	for(int i=0; i<nSamples; i++)
+		packed.m_normal[i] = cvxShape->m_normal[i];
+
+	for(int i=0; i<nPacked; i++)
+	{
+		const int first = 4*i;
+		packed.m_height4[i] = ChNarrowphaseImp::u32Pack(
+			cvxShape->m_data[first], cvxShape->m_data[first+1],
+			cvxShape->m_data[first+2], cvxShape->m_data[first+3] );
+		packed.m_supportHeight4[i] = ChNarrowphaseImp::u32Pack(
+			cvxShape->m_supportHeight[first], cvxShape->m_supportHeight[first+1],
+			cvxShape->m_supportHeight[first+2], cvxShape->m_supportHeight[first+3] );
+	}
+	packed.m_scale = cvxShape->m_scale;
+
+	shapes->write( &packed, 1, idx );
+	DeviceUtils::waitForCompletion( shapes->m_device );
+}
+
+// Run NarrowphaseKernel
+//
+// Launches the narrowphase kernel over `nPairs` candidate pairs.
+//   data       - state from allocate(): device, kernel and counter buffer
+//   pairs      - candidate pairs, mapped to the device for the launch
+//   bodyBuf    - rigid bodies, mapped to the device for the launch
+//   shapeBuf   - handle from allocateShapeBuffer(); asserted to be of type TYPE
+//   contactOut - contact buffer the kernel writes to; may already hold contacts
+//   nContacts  - in: number of contacts already in contactOut;
+//                out: counter read back from the device, clamped to
+//                contactOut->getSize()
+//   cfg        - supplies m_collisionMargin
+// Returns immediately, leaving nContacts unchanged, when nPairs == 0.
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+void ChNarrowphase<TYPE>::execute( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg )
+{
+	if( nPairs == 0 ) return;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	// Obtain TYPE-native views of the caller's buffers. The boolean template
+	// argument presumably requests that the current contents be copied across
+	// (BufferUtils is not visible here - confirm).
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* gContactOutNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, contactOut );	//	this might not be empty
+
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	// Free slots left in contactOut after the contacts already stored.
+	cdata.m_capacity = contactOut->getSize() - nContacts;
+
+	// Seed the device counter with the current contact count.
+	u32 n = nContacts;
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+
+	{
+		// Binding order must match the kernel's argument order.
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gContactOutNative ),
+			BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_narrowphaseKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		// 64 work-items per pair with a group size of 64 - presumably one
+		// work-group per pair (kernel source not visible here).
+		launcher.launch1D( nPairs*64, 64 );
+	}
+
+	// Read the counter back; `n` is only used after the wait below.
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );
+
+	// Only the contact buffer is unmapped with the <true> flag (presumably
+	// copy-back); the two inputs use <false>.
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gContactOutNative, contactOut );
+
+	// Never report more contacts than the output buffer can hold.
+	nContacts = min2((int)n, contactOut->getSize() );
+}
+
+// Run NarrowphaseWithPlaneKernel
+//
+// Same flow as the NarrowphaseKernel overload, but launches
+// data->m_narrowphaseWithPlaneKernel.
+//   nContacts - in: contacts already in contactOut; out: counter read back
+//               from the device, clamped to contactOut->getSize()
+// Returns immediately, leaving nContacts unchanged, when nPairs == 0.
+// NOTE(review): `vtxBuf` and `idxBuf` are never read in this body and are not
+// bound to the kernel - confirm whether that is intentional for this stub.
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+void ChNarrowphase<TYPE>::execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			const Buffer<float4>* vtxBuf, const Buffer<int4>* idxBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg )
+{
+	if( nPairs == 0 ) return;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	// TYPE-native views of the caller's buffers.
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );	
+	Buffer<Contact4>* gContactOutNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, contactOut );	//	this might not be empty
+
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	// Free slots left in contactOut after the contacts already stored.
+	cdata.m_capacity = contactOut->getSize() - nContacts;
+
+	// Seed the device counter with the current contact count.
+	u32 n = nContacts;
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+
+	{
+		// Binding order must match the kernel's argument order.
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gContactOutNative ),
+			BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_narrowphaseWithPlaneKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( nPairs*64, 64 );
+	}
+
+	// Read the counter back; `n` is only used after the wait below.
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );
+
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gContactOutNative, contactOut );
+
+	// Never report more contacts than the output buffer can hold.
+	nContacts = min2((int)n, contactOut->getSize() );
+}
+
+// Run SupportCullingKernel
+//
+// Launches the support-culling kernel over `nPairs` input pairs with
+// `pairsOut` bound as its output buffer.
+// Returns the device counter read back after the launch, clamped to
+// pairsOut->getSize(); returns 0 straight away when nPairs == 0.
+// NOTE(review): `pairsOut` is a pointer-to-const but is unmapped with the
+// <true> flag below, i.e. it is treated as an output.
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+int ChNarrowphase<TYPE>::culling( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf, const Buffer<int2>* pairsOut, const Config& cfg )
+{
+	if( nPairs == 0 ) return 0;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	// Inputs are mapped with <TYPE, true>, the output with <TYPE, false>.
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );	
+	Buffer<int2>* gPairsOutNative 
+		= BufferUtils::map<TYPE, false>( data->m_device, pairsOut );
+
+	//
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	cdata.m_capacity = pairsOut->getSize();
+
+	// The output counter starts from zero for every call.
+	u32 n = 0;
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+	{
+		// Binding order must match the kernel's argument order.
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gPairsOutNative ), BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_supportCullingKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		// One work-item per pair, group size 64.
+		// NOTE(review): nPairs is passed unrounded; whether launch1D pads it
+		// to a multiple of 64 is not visible here.
+		launcher.launch1D( nPairs, 64 );
+	}
+	// Read the counter back; `n` is only used after the wait below.
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );
+/*
+	if( gPairsInNative != pairs ) delete gPairsInNative;
+	if( gBodyInNative != bodyBuf ) delete gBodyInNative;
+	if( gPairsOutNative != pairsOut ) 
+	{
+		gPairsOutNative->read( pairsOut->m_ptr, n );
+		DeviceUtils::waitForCompletion( device );
+		delete gPairsOutNative;
+	}
+*/
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gPairsOutNative, pairsOut );
+
+	// Never report more pairs than the output buffer can hold.
+	return min2((int)n, pairsOut->getSize() );
+}
+
+#undef PATH
+#undef KERNEL0
+#undef KERNEL1
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.cl
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.h
@@ -0,0 +1,203 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+#ifndef __ADL_SOLVER_H
+#define __ADL_SOLVER_H
+
+
+#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+#include <AdlPrimitives/Search/BoundSearch.h>
+#include <AdlPrimitives/Sort/RadixSort.h>
+#include <AdlPrimitives/Scan/PrefixScan.h>
+#include <AdlPrimitives/Sort/RadixSort32.h>
+
+//#include <AdlPhysics/TypeDefinition.h>
+#include "AdlRigidBody.h"
+#include "AdlContact4.h"
+
+//#include "AdlPhysics/Batching/Batching.h>
+
+
+#define MYF4 float4
+#define MAKE_MYF4 make_float4
+
+//#define MYF4 float4sse
+//#define MAKE_MYF4 make_float4sse
+
+#include "AdlConstraint4.h"
+
+namespace adl
+{
+// Device-independent part of the contact solver: the constraint and
+// configuration structs, buffer allocation helpers and the batching
+// constants shared by every Solver<TYPE>.
+class SolverBase
+{
+	public:
+
+		// One solver constraint row.
+		struct ConstraintData
+		{
+			// Zero-initialises every scalar member. With both body indices at 0
+			// a default-constructed constraint reports isInvalid() == true
+			// instead of reading indeterminate values. The float4 members are
+			// left to float4's own default construction.
+			ConstraintData(): m_jacCoeffInv(0.f), m_b(0.f), m_appliedRambdaDt(0.f), m_bodyAPtr(0), m_bodyBPtr(0) {}
+
+			float4 m_linear; // have to be normalized
+			float4 m_angular0;
+			float4 m_angular1;
+			float m_jacCoeffInv;
+			float m_b;
+			float m_appliedRambdaDt;
+
+			u32 m_bodyAPtr;
+			u32 m_bodyBPtr;
+
+			// A constraint counts as invalid when both body indices sum to zero.
+			bool isInvalid() const { return ((u32)m_bodyAPtr+(u32)m_bodyBPtr) == 0; }
+			// The friction coefficient is stored in the w component of m_linear.
+			float getFrictionCoeff() const { return m_linear.w; }
+			void setFrictionCoeff(float coeff) { m_linear.w = coeff; }
+		};
+
+		// Parameters for turning contacts into constraints.
+		struct ConstraintCfg
+		{
+			// Every member now gets a defined value. m_enableParallelSolve and
+			// m_averageExtent used to be left uninitialised although
+			// Solver<TYPE>::reorderConvertToConstraints() and sortContacts()
+			// read them. Parallel solve defaults to off; the extent defaults to
+			// 1 so that 1/(N_OBJ_PER_SPLIT*m_averageExtent) stays finite.
+			// Callers that enable parallel solve are still expected to set
+			// m_averageExtent to a value that fits their scene.
+			ConstraintCfg( float dt = 0.f ): m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt),
+				m_enableParallelSolve(false), m_averageExtent(1.f), m_staticIdx(-1) {}
+
+			float m_positionDrift;
+			float m_positionConstraintCoeff;
+			float m_dt;
+			bool m_enableParallelSolve;	// run sortContacts()/batchContacts() before conversion
+			float m_averageExtent;		// scales the cell size used by sortContacts()
+			int m_staticIdx;
+		};
+
+		// Allocates a contact buffer with `capacity` elements on `device`.
+		static
+		__inline
+		Buffer<Contact4>* allocateContact4( const Device* device, int capacity )
+		{
+			return new Buffer<Contact4>( device, capacity );	
+		}
+
+		// Deletes a buffer returned by allocateContact4().
+		static
+		__inline
+		void deallocateContact4( Buffer<Contact4>* data ) { delete data; }
+
+		// Allocates a Buffer<Constraint4> with `capacity` elements and returns
+		// it as a SolverData handle.
+		static
+		__inline
+		SolverData allocateConstraint4( const Device* device, int capacity )
+		{
+			return new Buffer<Constraint4>( device, capacity );
+		}
+
+		// Deletes the Buffer<Constraint4> behind a SolverData handle.
+		static
+		__inline
+		void deallocateConstraint4( SolverData data ) { delete (Buffer<Constraint4>*)data; }
+
+		// Stub: allocates nothing and always returns a null pointer.
+		static
+		__inline
+		void* allocateFrictionConstraint( const Device* device, int capacity, u32 type = 0 )
+		{
+			return 0;
+		}
+
+		// Stub counterpart of allocateFrictionConstraint(); does nothing.
+		static
+		__inline
+		void deallocateFrictionConstraint( void* data ) 
+		{
+		}
+
+		// Batching constants used by the Solver<TYPE> implementation.
+		enum
+		{
+			N_SPLIT = 16,
+			N_BATCHES = 4,
+			N_OBJ_PER_SPLIT = 10,
+			N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT,
+		};
+};
+
+// Contact solver for devices of type TYPE. All members are static; the state
+// lives in a Data object created by allocate() and released by deallocate().
+// The member definitions are pulled in from Solver.inl / SolverHost.inl.
+template<DeviceType TYPE>
+class Solver : public SolverBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// Per-device solver state. Only m_nIterations is set by the
+		// constructor; allocate() fills in every other member.
+		struct Data
+		{
+			Data() : m_nIterations(4){}
+
+			const Device* m_device;
+			// Points to a SolverDeviceInl::ParallelSolveData (see Solver.inl).
+			void* m_parallelSolveData;
+			// Iteration count of each of the two solve passes.
+			int m_nIterations;
+			Kernel* m_batchingKernel;
+			Kernel* m_batchSolveKernel;
+			Kernel* m_contactToConstraintKernel;
+			Kernel* m_setSortDataKernel;
+			Kernel* m_reorderContactKernel;
+			Kernel* m_copyConstraintKernel;
+			//typename RadixSort<TYPE>::Data* m_sort;
+			typename RadixSort32<TYPE>::Data* m_sort32;
+			typename BoundSearch<TYPE>::Data* m_search;
+			typename PrefixScan<TYPE>::Data* m_scan;
+			Buffer<SortData>* m_sortDataBuffer;
+			// Scratch contact buffer; may be null until
+			// reorderConvertToConstraints() allocates it.
+			Buffer<Contact4>* m_contactBuffer;
+		};
+
+		enum
+		{
+			// allocate() pre-allocates m_contactBuffer only when pairCapacity
+			// is below this value.
+			DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
+		};
+
+		// Creates the solver state for `device`, sized for `pairCapacity` pairs.
+		static
+		Data* allocate( const Device* device, int pairCapacity );
+
+		// Releases everything allocate() created.
+		static
+		void deallocate( Data* data );
+
+		// Optionally sorts and batches the contacts (cfg.m_enableParallelSolve)
+		// and then converts them to constraints in `contactCOut`.
+		static
+		void reorderConvertToConstraints( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+		const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		// Runs the batch solve kernel over the constraints in `constraint`.
+		static
+		void solveContactConstraint( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* inertiaBuf, 
+			SolverData constraint, void* additionalData, int n );
+
+//		static
+//		int createSolveTasks( int batchIdx, Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+//			SolverData constraint, int n, ThreadPool::Task* tasksOut[], int taskCapacity );
+
+
+		//private:
+		// Launches the ContactToConstraint kernel for `nContacts` contacts.
+		static
+		void convertToConstraints( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		// Called by reorderConvertToConstraints() when parallel solve is enabled.
+		static
+		void sortContacts( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		// Called by reorderConvertToConstraints() after the contacts were sorted.
+		static
+		void batchContacts( Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx );
+
+};
+
+#include "Solver.inl"
+#include "SolverHost.inl"
+};
+
+#undef MYF4
+#undef MAKE_MYF4
+
+#endif //__ADL_SOLVER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
@@ -0,0 +1,762 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\SolverKernels"
+#define BATCHING_PATH "..\\..\\dynamics\\basic_demo\\Stubs\\batchingKernels"
+
+#define KERNEL1 "SingleBatchSolveKernel"
+#define KERNEL2 "BatchSolveKernel"
+
+#define KERNEL3 "ContactToConstraintKernel"
+#define KERNEL4 "SetSortDataKernel"
+#define KERNEL5 "ReorderContactKernel"
+#include "SolverKernels.h"
+
+#include "batchingKernels.h"
+
+
+// Debug record of 16 ints followed by 4 floats. Only used when DEBUG_ME is
+// defined: solveContactConstraint() then allocates one record per work-item,
+// binds the buffer to the batch solve kernel and prints m_valInt2/m_valInt3.
+// NOTE(review): presumably the member order has to match a struct of the
+// same name in the kernel source - confirm before reordering anything.
+struct SolverDebugInfo
+{
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	
+	int m_valInt4;
+	int m_valInt5;
+	int m_valInt6;
+	int m_valInt7;
+
+	int m_valInt8;
+	int m_valInt9;
+	int m_valInt10;
+	int m_valInt11;
+
+	int	m_valInt12;
+	int	m_valInt13;
+	int	m_valInt14;
+	int	m_valInt15;
+
+
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+};
+
+
+
+
+// Implementation-only types for the device solver.
+class SolverDeviceInl
+{
+public:
+	// Stored behind Solver<TYPE>::Data::m_parallelSolveData. Both buffers are
+	// allocated with N_SPLIT*N_SPLIT elements by Solver<TYPE>::allocate() and
+	// are bound to the batch solve kernel by solveContactConstraint().
+	struct ParallelSolveData
+	{
+		Buffer<u32>* m_numConstraints;
+		Buffer<u32>* m_offsets;
+	};
+};
+
+// Creates the solver state for `device`: fetches the batching and solver
+// kernels, allocates the batching buffers and the sort/search/scan helpers
+// and, for small capacities, the scratch contact buffer.
+// Release the result with deallocate().
+template<DeviceType TYPE>
+typename Solver<TYPE>::Data* Solver<TYPE>::allocate( const Device* device, int pairCapacity )
+{
+	// Embedded kernel sources, indexed by device type; all-null when the
+	// kernels are not compiled in as strings.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* solverSource[] = { solverKernelsCL, 0 };
+	const char* batchingSource[] = { batchingKernelsCL, 0 };
+#else
+	const char* solverSource[] = { 0, 0 };
+	const char* batchingSource[] = { 0, 0 };
+#endif
+
+	Data* solver = new Data;
+	solver->m_device = device;
+
+	// Kernels. The first two are fetched with the cache flag set.
+	bool cacheBatchingKernel = true;
+	solver->m_batchingKernel = device->getKernel( BATCHING_PATH, "CreateBatches", "-I ..\\..\\ ", batchingSource[TYPE], cacheBatchingKernel );
+
+	bool cacheSolverKernel = true;
+	solver->m_batchSolveKernel = device->getKernel( PATH, KERNEL2, "-I ..\\..\\ ", solverSource[TYPE], cacheSolverKernel );
+
+	solver->m_contactToConstraintKernel = device->getKernel( PATH, KERNEL3, "-I ..\\..\\ ", solverSource[TYPE] );
+	solver->m_setSortDataKernel = device->getKernel( PATH, KERNEL4, "-I ..\\..\\ ", solverSource[TYPE] );
+	solver->m_reorderContactKernel = device->getKernel( PATH, KERNEL5, "-I ..\\..\\ ", solverSource[TYPE] );
+	solver->m_copyConstraintKernel = device->getKernel( PATH, "CopyConstraintKernel", "-I ..\\..\\ ", solverSource[TYPE] );
+
+	// Per-cell constraint counts and offsets, one entry per cell of the
+	// N_SPLIT x N_SPLIT grid.
+	SolverDeviceInl::ParallelSolveData* batchData = new SolverDeviceInl::ParallelSolveData;
+	batchData->m_numConstraints = new Buffer<u32>( device, N_SPLIT*N_SPLIT );
+	batchData->m_offsets = new Buffer<u32>( device, N_SPLIT*N_SPLIT );
+	solver->m_parallelSolveData = batchData;
+
+	// The sort works on sizes that are a multiple of 512.
+	// todo. remove hardcode this
+	const int sortSize = NEXTMULTIPLEOF( pairCapacity, 512 );
+	solver->m_sort32 = RadixSort32<TYPE>::allocate( device, sortSize );
+	solver->m_search = BoundSearch<TYPE>::allocate( device, N_SPLIT*N_SPLIT );
+	solver->m_scan = PrefixScan<TYPE>::allocate( device, N_SPLIT*N_SPLIT );
+	solver->m_sortDataBuffer = new Buffer<SortData>( device, sortSize );
+
+	// Large capacities defer the scratch contact buffer to
+	// reorderConvertToConstraints(), which allocates it on demand.
+	solver->m_contactBuffer = ( pairCapacity < DYNAMIC_CONTACT_ALLOCATION_THRESHOLD )
+		? new Buffer<Contact4>( device, pairCapacity )
+		: 0;
+
+	return solver;
+}
+
+// Releases everything allocate() created: the batching buffers, the
+// sort/search/scan helpers, the sort-data buffer, the scratch contact buffer
+// (if any) and finally the Data object. A null `data` is ignored, mirroring
+// the behaviour of `delete`.
+template<DeviceType TYPE>
+void Solver<TYPE>::deallocate( Data* data )
+{
+	if( data == 0 ) return;
+
+	{
+		SolverDeviceInl::ParallelSolveData* solveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+		delete solveData->m_numConstraints;
+		delete solveData->m_offsets;
+		delete solveData;
+	}
+
+//	RadixSort<TYPE>::deallocate( data->m_sort );
+	RadixSort32<TYPE>::deallocate(data->m_sort32);
+	BoundSearch<TYPE>::deallocate( data->m_search );
+	PrefixScan<TYPE>::deallocate( data->m_scan );
+
+	delete data->m_sortDataBuffer;
+	// m_contactBuffer may legitimately be null; deleting null is a no-op.
+	delete data->m_contactBuffer;
+
+	delete data;
+}
+
+// Prepares `nContacts` contacts for solving and converts them to constraints.
+//
+// Steps:
+//   1. make sure data->m_contactBuffer can hold nContacts entries;
+//   2. if cfg.m_enableParallelSolve: sortContacts(), copy
+//      data->m_contactBuffer back over the mapped contacts with
+//      CopyConstraintKernel, then batchContacts();
+//   3. convertToConstraints() into `contactCOut`.
+// The device is synchronised after each step.
+// `additionalData` is only forwarded to the helpers.
+template<DeviceType TYPE>
+void Solver<TYPE>::reorderConvertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	// Drop a scratch buffer that is too small ...
+	if( data->m_contactBuffer )
+	{
+		if( data->m_contactBuffer->getSize() < nContacts )
+		{
+			BT_PROFILE("delete data->m_contactBuffer;");
+			delete data->m_contactBuffer;
+			data->m_contactBuffer = 0;
+		}
+	}
+	// ... and (re)allocate it with exactly nContacts elements.
+	if( data->m_contactBuffer == 0 )
+	{
+		BT_PROFILE("new data->m_contactBuffer;");
+
+		data->m_contactBuffer = new Buffer<Contact4>( data->m_device, nContacts );
+	}
+	Stopwatch sw;
+
+	// NOTE(review): the device type is hard-coded to TYPE_CL here rather than
+	// TYPE; convertToConstraints() asserts TYPE_CL as well.
+	Buffer<Contact4>* contactNative = BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn, nContacts );
+
+	//DeviceUtils::Config dhCfg;
+	//Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		DeviceUtils::waitForCompletion( data->m_device );
+		sw.start();
+		//	contactsIn -> data->m_contactBuffer
+		{
+			BT_PROFILE("sortContacts");
+			Solver<TYPE>::sortContacts( data, bodyBuf, contactNative, additionalData, nContacts, cfg );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		sw.split();
+		// Disabled host round trip; the device-side copy below is used instead.
+		if(0)
+		{
+			Contact4* tmp = new Contact4[nContacts];
+			data->m_contactBuffer->read( tmp, nContacts );
+			DeviceUtils::waitForCompletion( data->m_contactBuffer->m_device );
+			contactNative->write( tmp, nContacts );
+			DeviceUtils::waitForCompletion( contactNative->m_device );
+			delete [] tmp;
+		}
+		else
+		{
+			BT_PROFILE("m_copyConstraintKernel");
+
+			Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+			// NOTE(review): only cdata.x is assigned; y, z and w are passed to
+			// the kernel uninitialised.
+			int4 cdata; cdata.x = nContacts;
+			BufferInfo bInfo[] = { BufferInfo( data->m_contactBuffer ), BufferInfo( contactNative ) };
+//			Launcher launcher( data->m_device, data->m_device->getKernel( PATH, "CopyConstraintKernel",  "-I ..\\..\\ -Wf,--c++", 0 ) );
+			Launcher launcher( data->m_device, data->m_copyConstraintKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			launcher.launch1D( nContacts, 64 );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		{
+			BT_PROFILE("batchContacts");
+			Solver<TYPE>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, cfg.m_staticIdx );
+
+		}
+	}
+	{
+			BT_PROFILE("waitForCompletion (batchContacts)");
+			DeviceUtils::waitForCompletion( data->m_device );
+	}
+	// NOTE(review): sw.start() is only called inside the parallel-solve branch
+	// above, but split()/stop() run unconditionally - confirm Stopwatch
+	// tolerates that.
+	sw.split();
+	//================
+	if(0)
+	{
+//		Solver<TYPE_HOST>::Data* solverHost = Solver<TYPE_HOST>::allocate( deviceHost, nContacts );
+//		Solver<TYPE_HOST>::convertToConstraints( solverHost, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+//		Solver<TYPE_HOST>::deallocate( solverHost );
+	}
+	else
+	{
+		BT_PROFILE("convertToConstraints");
+		Solver<TYPE>::convertToConstraints( data, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+	}
+	{
+		BT_PROFILE("convertToConstraints waitForCompletion");
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.stop();
+
+	{
+		BT_PROFILE("printf");
+
+		// The timings are fetched but not used; the printf is commented out.
+		float t[5];
+		sw.getMs( t, 3 );
+//		printf("%3.2f, %3.2f, %3.2f, ", t[0], t[1], t[2]);
+	}
+
+	{
+		BT_PROFILE("deallocate and unmap");
+
+		//DeviceUtils::deallocate( deviceHost );
+
+		BufferUtils::unmap<true>( contactNative, contactsIn, nContacts );
+	}
+}
+
+
+// Solves the contact constraints in `constraint` on the device.
+//
+// The batch solve kernel is launched in two passes (cdata.x == 0, then
+// cdata.x == 1). Each pass runs data->m_nIterations iterations, and every
+// iteration launches the kernel once per batch index (0 .. N_BATCHES-1),
+// using the per-cell counts and offsets stored in data->m_parallelSolveData.
+//   data           - state from allocate(); must not be null
+//   bodyBuf        - rigid bodies; unmapped with <true> afterwards
+//   shapeBuf       - inertia data; unmapped with <false> afterwards
+//   constraint     - SolverData handle wrapping a Buffer<Constraint4>
+//   additionalData - only used by the disabled host path
+//   n              - only used to seed cdata.x, which is overwritten below
+template<DeviceType TYPE>
+void Solver<TYPE>::solveContactConstraint( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	// Disabled host fallback, kept for reference.
+	if(0)
+	{
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::solveContactConstraint( hostData, bodyBuf, shapeBuf, constraint, additionalData, n );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	ADLASSERT( data );
+
+	Buffer<Constraint4>* cBuffer =0;
+	
+	Buffer<RigidBodyBase::Body>* gBodyNative=0; 
+	Buffer<RigidBodyBase::Inertia>* gShapeNative =0;
+	Buffer<Constraint4>* gConstraintNative =0;
+	
+
+	{
+		BT_PROFILE("map");
+		cBuffer = (Buffer<Constraint4>*)constraint;
+
+		gBodyNative= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		gShapeNative= BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		gConstraintNative = BufferUtils::map<TYPE, true>( data->m_device, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+
+	// NOTE(review): unlike the other launch sites in this file, the constant
+	// buffer is default-constructed (no device, no BUFFER_CONST) - confirm
+	// that Launcher::setConst does not need an allocated buffer.
+	Buffer<int4> constBuffer;
+	int4 cdata = make_int4( n, 0, 0, 0 );
+	{
+		SolverDeviceInl::ParallelSolveData* solveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+		const int nn = N_SPLIT*N_SPLIT;
+
+		// x selects the pass; y is a fixed kernel parameter whose meaning is
+		// defined by the kernel source (not visible here).
+		cdata.x = 0;
+		cdata.y = 250;
+
+#if 0
+//check how the cells are filled
+		unsigned int* hostCounts = new unsigned int[N_SPLIT*N_SPLIT];
+		solveData->m_numConstraints->read(hostCounts,N_SPLIT*N_SPLIT);
+		DeviceUtils::waitForCompletion( data->m_device );
+		for (int i=0;i<N_SPLIT*N_SPLIT;i++)
+		{
+			if (hostCounts[i])
+			{
+				printf("hostCounts[%d]=%d\n",i,hostCounts[i]);
+			}
+		}
+		delete[] hostCounts;
+#endif
+
+		// 64 work-items for each of the nn/N_BATCHES cells handled per launch.
+		int numWorkItems = 64*nn/N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+#endif
+
+
+
+		{
+
+			BT_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+#ifdef DEBUG_ME
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+				
+
+					// Binding order must match the kernel's argument order.
+					BufferInfo bInfo[] = { 
+
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets ) 
+#ifdef DEBUG_ME
+						,	BufferInfo(&gpuDebugInfo)
+#endif
+						};
+
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					
+					launcher.launch1D( numWorkItems, 64 );
+
+#ifdef DEBUG_ME
+					DeviceUtils::waitForCompletion( data->m_device );
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					DeviceUtils::waitForCompletion( data->m_device );
+					for (int i=0;i<numWorkItems;i++)
+					{
+						// The format strings take the index AND the value; they
+						// used to contain a single %d and printed only `i`.
+						if (debugInfo[i].m_valInt2>0)
+						{
+							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+
+
+				}
+			}
+		
+			DeviceUtils::waitForCompletion( data->m_device );
+
+
+		}
+
+		// Second pass: same launches with cdata.x == 1.
+		cdata.x = 1;
+		{
+			BT_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+					BufferInfo bInfo[] = { 
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets )
+#ifdef DEBUG_ME
+						,BufferInfo(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					launcher.launch1D( 64*nn/N_BATCHES, 64 );
+				}
+			}
+			DeviceUtils::waitForCompletion( data->m_device );
+			
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+
+	{
+		BT_PROFILE("unmap");
+		// Bodies and constraints are unmapped with <true>, the inertia data
+		// with <false>.
+		BufferUtils::unmap<true>( gBodyNative, bodyBuf );
+		BufferUtils::unmap<false>( gShapeNative, shapeBuf );
+		BufferUtils::unmap<true>( gConstraintNative, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+}
+
+// Converts `nContacts` contacts into constraints by launching the
+// ContactToConstraint kernel, then waits for the device to finish.
+//   data        - solver state; its device must be of type TYPE_CL (asserted)
+//   bodyBuf     - rigid bodies (input)
+//   shapeBuf    - inertia data (input)
+//   contactsIn  - contacts to convert (input)
+//   contactCOut - SolverData handle wrapping the Buffer<Constraint4> output
+//   cfg         - supplies m_dt, m_positionDrift and m_positionConstraintCoeff
+// `additionalData` is not used in this body.
+template<DeviceType TYPE>
+void Solver<TYPE>::convertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	Buffer<RigidBodyBase::Body>* bodyNative =0;
+	Buffer<RigidBodyBase::Inertia>* shapeNative =0;
+	Buffer<Contact4>* contactNative =0;
+	Buffer<Constraint4>* constraintNative =0;
+
+	{
+		BT_PROFILE("map buffers");
+
+		// The three inputs are mapped with <TYPE, true>, the output with
+		// <TYPE, false>.
+		bodyNative = BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		shapeNative  = BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		contactNative= BufferUtils::map<TYPE, true>( data->m_device, contactsIn );
+		constraintNative = BufferUtils::map<TYPE, false>( data->m_device, (Buffer<Constraint4>*)contactCOut );
+	}
+	// Constant block passed to the kernel.
+	// NOTE(review): presumably the member order has to match the kernel-side
+	// declaration - confirm before reordering.
+	struct CB
+	{
+		int m_nContacts;
+		float m_dt;
+		float m_positionDrift;
+		float m_positionConstraintCoeff;
+	};
+
+	{
+		BT_PROFILE("m_contactToConstraintKernel");
+		CB cdata;
+		cdata.m_nContacts = nContacts;
+		cdata.m_dt = cfg.m_dt;
+		cdata.m_positionDrift = cfg.m_positionDrift;
+		cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
+
+		Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		// Binding order must match the kernel's argument order.
+		BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( bodyNative ), BufferInfo( shapeNative ),
+			BufferInfo( constraintNative )};
+		Launcher launcher( data->m_device, data->m_contactToConstraintKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		// One work-item per contact, group size 64.
+		launcher.launch1D( nContacts, 64 );	
+		DeviceUtils::waitForCompletion( data->m_device );
+
+	}
+
+	{
+		BT_PROFILE("unmap");
+		// Only the constraint output is unmapped with <true>.
+		BufferUtils::unmap<false>( bodyNative, bodyBuf );
+		BufferUtils::unmap<false>( shapeNative, shapeBuf );
+		BufferUtils::unmap<false>( contactNative, contactsIn );
+		BufferUtils::unmap<true>( constraintNative, (Buffer<Constraint4>*)contactCOut );
+	}
+}
+
+//	Device path: groups the contacts by cell in preparation for the parallel solve.
+//	When cfg.m_enableParallelSolve is set, the following steps run on the device:
+//	  2. m_setSortDataKernel is launched with the contacts, bodies and
+//	     data->m_sortDataBuffer bound ("set cell idx"),
+//	  3. data->m_sortDataBuffer is sorted with RadixSort32,
+//	  4. BoundSearch counts the entries per cell and PrefixScan turns the counts
+//	     into offsets (both live in the ParallelSolveData of `data`),
+//	  5. m_reorderContactKernel is launched with the contacts,
+//	     data->m_contactBuffer and the sort data bound.
+//	When the flag is clear the buffers are only mapped and unmapped.
+//	additionalData is not used.
+template<DeviceType TYPE>
+void Solver<TYPE>::sortContacts( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+	Buffer<RigidBodyBase::Body>* bodyNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn );
+
+	const int sortAlignment = 512; // todo. get this out of sort
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		//	the sort works on a size rounded up to the sort alignment
+		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
+
+		Buffer<u32>* countsNative = nativeSolveData->m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+		Buffer<u32>* offsetsNative = nativeSolveData->m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+
+		{	//	2. set cell idx
+			struct CB
+			{
+				int m_nContacts;
+				int m_staticIdx;
+				float m_scale;
+				int m_nSplit;
+			};
+
+			ADLASSERT( sortSize%64 == 0 );
+			CB cdata;
+			cdata.m_nContacts = nContacts;
+			cdata.m_staticIdx = cfg.m_staticIdx;
+			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
+			cdata.m_nSplit = N_SPLIT;
+
+			Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+			BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( bodyNative ), BufferInfo( data->m_sortDataBuffer ) };
+			Launcher launcher( data->m_device, data->m_setSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			//	launched over the padded size, not nContacts
+			launcher.launch1D( sortSize, 64 );
+		}
+
+		{	//	3. sort by cell idx
+			//	todo. keys only range over N_SPLIT*N_SPLIT cells; a narrower
+			//	(16 or 8 bit) sort was considered here but the full sort is used
+			RadixSort32<TYPE>::execute( data->m_sort32, *data->m_sortDataBuffer,sortSize);
+		}
+		{	//	4. find entries
+			BoundSearch<TYPE>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, BoundSearchBase::COUNT );
+
+			PrefixScan<TYPE>::execute( data->m_scan, *countsNative, *offsetsNative, N_SPLIT*N_SPLIT );
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	todo. preallocate this
+//			ADLASSERT( contactsIn->getType() == TYPE_HOST );
+//			Buffer<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
+
+			{
+				Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+				//	only x carries data; the rest is zeroed so no indeterminate
+				//	values are uploaded with the constant buffer
+				int4 cdata;
+				cdata.x = nContacts;
+				cdata.y = 0;
+				cdata.z = 0;
+				cdata.w = 0;
+				BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( data->m_contactBuffer ), BufferInfo( data->m_sortDataBuffer ) };
+				Launcher launcher( data->m_device, data->m_reorderContactKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+				launcher.setConst( constBuffer, cdata );
+				launcher.launch1D( nContacts, 64 );
+			}
+//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
+		}
+	}
+
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<false>( contactNative, contactsIn );
+}
+
+//	Device path: runs m_batchingKernel over the contacts and copies the result back.
+//	  contacts  : contact buffer; mapped to the device, bound as the kernel's first
+//	              buffer and overwritten afterwards with data->m_contactBuffer
+//	  nContacts : number of contacts
+//	  n, offsets: per-cell counts and offsets (N_SPLIT*N_SPLIT entries are read in
+//	              the debug dump below)
+//	  staticIdx : passed to the kernel in cdata.z
+//	The kernel is launched with 64 work items per cell (64*N_SPLIT*N_SPLIT in total).
+//	The if(0) sections are disabled host-fallback / diagnostic code kept for reference.
+template<DeviceType TYPE>
+void Solver<TYPE>::batchContacts( typename Solver<TYPE>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	if(0)
+	{
+		//	disabled: do the batching with the host solver instead
+		BT_PROFILE("CPU classTestKernel/Kernel (batch generation?)");
+
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::batchContacts( hostData, contacts, nContacts, n, offsets, staticIdx );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contacts, nContacts );
+	Buffer<u32>* nNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, n );
+	Buffer<u32>* offsetsNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, offsets );
+
+	{
+		BT_PROFILE("GPU classTestKernel/Kernel (batch generation?)");
+		Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		int4 cdata;
+		cdata.x = nContacts;
+		cdata.y = 0;
+		cdata.z = staticIdx;
+		cdata.w = 0;	//	unused, zeroed so no indeterminate value is uploaded
+
+		int numWorkItems = 64*N_SPLIT*N_SPLIT;
+#ifdef BATCH_DEBUG
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+		gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+		BufferInfo bInfo[] = { 
+			BufferInfo( contactNative ), 
+			BufferInfo( data->m_contactBuffer ), 
+			BufferInfo( nNative ), 
+			BufferInfo( offsetsNative ) 
+#ifdef BATCH_DEBUG
+			,	BufferInfo(&gpuDebugInfo)
+#endif
+		};
+
+		
+		
+		Launcher launcher( data->m_device, data->m_batchingKernel);
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( numWorkItems, 64 );
+		DeviceUtils::waitForCompletion( data->m_device );
+
+#ifdef BATCH_DEBUG
+		//	read back the batched contacts and the per-work-item debug records
+		Contact4* hostContacts = new Contact4[nContacts];
+		data->m_contactBuffer->read(hostContacts,nContacts);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		gpuDebugInfo.read(debugInfo,numWorkItems);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		for (int i=0;i<numWorkItems;i++)
+		{
+			if (debugInfo[i].m_valInt1>0)
+			{
+				printf("catch\n");
+			}
+			if (debugInfo[i].m_valInt2>0)
+			{
+				printf("catch22\n");
+			}
+
+			if (debugInfo[i].m_valInt3>0)
+			{
+				printf("catch666\n");
+			}
+
+			if (debugInfo[i].m_valInt4>0)
+			{
+				printf("catch777\n");
+			}
+		}
+		delete[] hostContacts;
+		delete[] debugInfo;
+#endif //BATCH_DEBUG
+
+	}
+
+	if(0)
+	{
+		//	disabled: dump per-cell counts and the batch index sequence
+		u32* nhost = new u32[N_SPLIT*N_SPLIT];
+
+		nNative->read( nhost, N_SPLIT*N_SPLIT );
+
+		Contact4* chost = new Contact4[nContacts];
+		data->m_contactBuffer->read( chost, nContacts );
+		DeviceUtils::waitForCompletion( data->m_device );
+		printf(">>");
+		int nonzero = 0;
+		u32 maxn = 0;
+		for(int i=0; i<N_SPLIT*N_SPLIT; i++)
+		{
+			printf("%d-", nhost[i]);
+			nonzero += (nhost[i]==0)? 0:1;
+			maxn = max2( nhost[i], maxn );
+		}
+		printf("\nnonzero:zero = %d:%d (%d)\n", nonzero, N_SPLIT*N_SPLIT-nonzero, maxn);
+		printf("\n\n");
+
+		int prev = 0;
+		int prevIdx = 0;
+		int maxNBatches = 0;
+		for(int i=0; i<nContacts; i++)
+		{
+//			printf("(%d, %d:%d),", chost[i].m_batchIdx, chost[i].m_bodyAPtr, chost[i].m_bodyBPtr);
+			if( prev != 0 && chost[i].m_batchIdx == 0 )
+			{
+				maxNBatches = max2( maxNBatches, prev );
+				printf("\n[%d]", prev);
+
+				//for(int j=prevIdx; j<i; j++)
+				//{
+				//	printf("(%d:%d),", chost[j].m_bodyAPtr, chost[j].m_bodyBPtr);
+				//}
+
+				//printf("\n");
+
+				prevIdx = i;
+			}
+
+			printf("%d,", chost[i].m_batchIdx);
+
+			prev = chost[i].m_batchIdx;
+		}
+		printf("\n");
+		printf("Max: %d\n", maxNBatches);
+
+		delete [] chost;
+		delete [] nhost;
+	}
+//	copy buffer to buffer
+	contactNative->write( *data->m_contactBuffer, nContacts );
+	DeviceUtils::waitForCompletion( data->m_device );
+
+	if(0)
+	{
+		//	disabled: verify that no contact references a body index above staticIdx
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			HostBuffer<Contact4> host( deviceHost, nContacts );
+			contactNative->read( host.m_ptr, nContacts );
+			DeviceUtils::waitForCompletion( data->m_device );
+
+			for(int i=0; i<nContacts; i++)
+			{
+				ADLASSERT( host[i].m_bodyAPtr <= (u32)staticIdx );
+				ADLASSERT( host[i].m_bodyBPtr <= (u32)staticIdx );
+			}
+		}
+		DeviceUtils::deallocate( deviceHost );
+	}
+
+	BufferUtils::unmap<true>( contactNative, contacts );
+	BufferUtils::unmap<false>( nNative, n );
+	BufferUtils::unmap<false>( offsetsNative, offsets );
+}
+
+#undef PATH
+#undef KERNEL1
+#undef KERNEL2
+
+#undef KERNEL3
+#undef KERNEL4
+#undef KERNEL5
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverHost.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverHost.inl
@@ -0,0 +1,848 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+class SolverInl
+{
+public:
+	typedef SolverBase::ConstraintData ConstraintData;
+
+
+	//	Writes the Jacobian terms for direction n:
+	//	linear = -n, angular0 = -(r0 x n), angular1 = r1 x n.
+	static
+	__forceinline
+	void setLinearAndAngular(const MYF4& n, const MYF4& r0, const MYF4& r1,
+							 MYF4& linear, MYF4& angular0, MYF4& angular1)
+	{
+		linear = -n;
+
+		const MYF4 r0CrossN = cross3(r0, n);
+		angular0 = -r0CrossN;
+
+		angular1 = cross3(r1, n);
+	}
+
+	//	Returns -1 / (invMass0 + a0.(a0*I0inv) + invMass1 + a1.(a1*I1inv)).
+	//	linear0 and linear1 are not read: they are taken to be normalized, so
+	//	their contribution reduces to the inverse masses.
+	static
+	__forceinline
+	float calcJacCoeff(const MYF4& linear0, const MYF4& linear1, const MYF4& angular0, const MYF4& angular1,
+					  float invMass0, const Matrix3x3& invInertia0, float invMass1, const Matrix3x3& invInertia1)
+	{
+		//	angular contributions of body 0 and body 1
+		const float angTerm0 = dot3F4(mtMul3(angular0,invInertia0), angular0);
+		const float angTerm1 = dot3F4(mtMul3(angular1,invInertia1), angular1);
+
+		//	linear contributions would be dot3F4(linear, linear)*invMass == invMass
+		const float denom = invMass0+angTerm0+invMass1+angTerm1;
+		return -1.f/denom;
+	}
+	//	Relative velocity along a constraint row:
+	//	l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1
+	static
+	__forceinline
+	float calcRelVel(const MYF4& l0, const MYF4& l1, const MYF4& a0, const MYF4& a1, 
+					 const MYF4& linVel0, const MYF4& angVel0, const MYF4& linVel1, const MYF4& angVel1)
+	{
+		//	accumulated left to right, same order as the formula above
+		float relVel = dot3F4(l0, linVel0);
+		relVel = relVel + dot3F4(a0, angVel0);
+		relVel = relVel + dot3F4(l1, linVel1);
+		relVel = relVel + dot3F4(a1, angVel1);
+		return relVel;
+	}
+
+	//	Fills dstC from contact src for the body pair A/B.
+	//	  pos*, linVel*, angVel*     : body state used to build the rows
+	//	  invMass*, invInertia*      : inverse mass properties passed to calcJacCoeff
+	//	  src                        : contact with up to 4 points (src.getNPoints())
+	//	  cfg                        : m_dt, m_positionDrift, m_positionConstraintCoeff
+	//	  dstC                       : output constraint
+	//	Per used point it stores the normal-row coefficient (m_jacCoeffInv) and the
+	//	bias m_b; unused points get m_jacCoeffInv = 0. Friction coefficients and
+	//	m_center are only written when the contact has more than one point.
+	static
+	__forceinline
+	void setConstraint4( const MYF4& posA, const MYF4& linVelA, const MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA, 
+		const MYF4& posB, const MYF4& linVelB, const MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		const Contact4& src, const SolverBase::ConstraintCfg& cfg, 
+		Constraint4& dstC )
+	{
+		dstC.m_bodyA = (u32)src.m_bodyAPtr;
+		dstC.m_bodyB = (u32)src.m_bodyBPtr;
+
+		float dtInv = 1.f/cfg.m_dt;
+		//	reset the accumulated normal impulses
+		for(int ic=0; ic<4; ic++)
+		{
+			dstC.m_appliedRambdaDt[ic] = 0.f;
+		}
+		//	zero friction coefficients mark "no friction rows" (see solveFriction's early-out)
+		dstC.m_fJacCoeffInv[0] = dstC.m_fJacCoeffInv[1] = 0.f;
+
+
+		const MYF4& n = src.m_worldNormal;
+		dstC.m_linear = -n;
+		dstC.setFrictionCoeff( src.getFrictionCoeff() );
+		for(int ic=0; ic<4; ic++)
+		{
+			MYF4 r0 = src.m_worldPos[ic] - posA;
+			MYF4 r1 = src.m_worldPos[ic] - posB;
+
+			//	unused point slot: a zero coefficient makes solveContact skip it
+			if( ic >= src.getNPoints() )
+			{
+				dstC.m_jacCoeffInv[ic] = 0.f;
+				continue;
+			}
+
+			float relVelN;
+			{
+				MYF4 linear, angular0, angular1;
+				setLinearAndAngular(n, r0, r1, linear, angular0, angular1);
+
+				dstC.m_jacCoeffInv[ic] = calcJacCoeff(linear, -linear, angular0, angular1,
+					invMassA, invInertiaA, invMassB, invInertiaB );
+
+				relVelN = calcRelVel(linear, -linear, angular0, angular1,
+					linVelA, angVelA, linVelB, angVelB);
+
+				//	restitution is dropped when relVelN^2 is below 0.004
+				float e = src.getRestituitionCoeff();
+				if( relVelN*relVelN < 0.004f ) e = 0.f;
+
+				//	bias = restitution term + penetration correction scaled by 1/dt
+				dstC.m_b[ic] = e*relVelN;
+				dstC.m_b[ic] += (src.getPenetration(ic) + cfg.m_positionDrift)*cfg.m_positionConstraintCoeff*dtInv;
+				dstC.m_appliedRambdaDt[ic] = 0.f;
+			}
+		}
+
+		if( src.getNPoints() > 1 )
+		{	//	prepare friction
+			//	friction acts at the average of the contact points
+			MYF4 center = MAKE_MYF4(0.f);
+			for(int i=0; i<src.getNPoints(); i++) center += src.m_worldPos[i];
+			center /= (float)src.getNPoints();
+
+			//	two tangent directions built from the normal and (point 0 - center)
+			MYF4 tangent[2];
+			tangent[0] = cross3( src.m_worldNormal, src.m_worldPos[0]-center );
+			tangent[1] = cross3( tangent[0], src.m_worldNormal );
+			tangent[0] = normalize3( tangent[0] );
+			tangent[1] = normalize3( tangent[1] );
+			MYF4 r[2];
+			r[0] = center - posA;
+			r[1] = center - posB;
+
+			for(int i=0; i<2; i++)
+			{
+				MYF4 linear, angular0, angular1;
+				setLinearAndAngular(tangent[i], r[0], r[1], linear, angular0, angular1);
+
+				dstC.m_fJacCoeffInv[i] = calcJacCoeff(linear, -linear, angular0, angular1,
+					invMassA, invInertiaA, invMassB, invInertiaB );
+				dstC.m_fAppliedRambdaDt[i] = 0.f;
+			}
+			dstC.m_center = center;
+		}
+		else
+		{
+			//	single point constraint
+			//	NOTE(review): m_center and m_fAppliedRambdaDt are left unwritten on this path
+		}
+
+		//	copy the used contact points, zero the unused slots
+		for(int i=0; i<4; i++)
+		{
+			if( i<src.getNPoints() )
+			{
+				dstC.m_worldPos[i] = src.m_worldPos[i];
+			}
+			else
+			{
+				dstC.m_worldPos[i] = MAKE_MYF4(0.f);
+			}
+		}
+	}
+
+/*
+	struct Constraint4
+	{
+		float4 m_linear;			X
+		float4 m_angular0[4];		X
+		float4 m_angular1[4];		center
+		float m_jacCoeffInv[4];		[0,1]
+		float m_b[4];				X
+		float m_appliedRambdaDt[4];	[0,1]
+
+		void* m_bodyAPtr;			X
+		void* m_bodyBPtr;			X
+	};
+*/
+	//	Applies the two friction rows of constraint cs to bodies A and B in place.
+	//	  cs                         : constraint; m_fAppliedRambdaDt is updated
+	//	  pos*, invMass*, invInertia*: body state / inverse mass properties (read only)
+	//	  linVel*, angVel*           : body velocities, modified in place
+	//	  maxRambdaDt, minRambdaDt   : clamp limits; only entries [0] and [1] are read
+	//	Returns immediately when both friction coefficients are zero, which is how
+	//	setConstraint4 leaves them for single point contacts.
+	static
+	__inline
+	void solveFriction(Constraint4& cs, 
+		const MYF4& posA, MYF4& linVelA, MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA,
+		const MYF4& posB, MYF4& linVelB, MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		//	both rows must be unset to skip (the second test used to repeat index 0)
+		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return;
+		const MYF4& center = cs.m_center;
+
+		MYF4 n = -cs.m_linear;
+
+		//	rebuild the same tangent frame that setConstraint4 used
+		MYF4 tangent[2];
+		tangent[0] = cross3( n, cs.m_worldPos[0]-center );
+		tangent[1] = cross3( tangent[0], n );
+		tangent[0] = normalize3( tangent[0] );
+		tangent[1] = normalize3( tangent[1] );
+
+		MYF4 angular0, angular1, linear;
+		MYF4 r0 = center - posA;
+		MYF4 r1 = center - posB;
+		for(int i=0; i<2; i++)
+		{
+			setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
+			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB );
+			rambdaDt *= cs.m_fJacCoeffInv[i];
+
+			{
+				//	clamp the accumulated impulse, then apply only the difference
+				float prevSum = cs.m_fAppliedRambdaDt[i];
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = max2( updated, minRambdaDt[i] );
+				updated = min2( updated, maxRambdaDt[i] );
+				rambdaDt = updated - prevSum;
+				cs.m_fAppliedRambdaDt[i] = updated;
+			}
+
+			MYF4 linImp0 = invMassA*linear*rambdaDt;
+			MYF4 linImp1 = invMassB*(-linear)*rambdaDt;
+			MYF4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+			MYF4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+
+			linVelA += linImp0;
+			angVelA += angImp0;
+			linVelB += linImp1;
+			angVelB += angImp1;
+		}
+
+		{	//	angular damping for point constraint
+			MYF4 ab = normalize3( posB - posA );
+			MYF4 ac = normalize3( center - posA );
+			//	applied when the contact center lies close to the A->B axis, or
+			//	when either body has zero inverse mass
+			if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+			{
+				float angNA = dot3F4( n, angVelA );
+				float angNB = dot3F4( n, angVelB );
+
+				//	remove 10% of the spin about the contact normal
+				angVelA -= (angNA*0.1f)*n;
+				angVelB -= (angNB*0.1f)*n;
+			}
+		}
+	}
+
+	//	Applies the normal (contact) rows of constraint cs to bodies A and B.
+	//	  JACOBI = false : each point's impulse is applied to the velocities at once,
+	//	                   so later points see the updated velocities
+	//	  JACOBI = true  : impulses are accumulated and applied after all points
+	//	  cs             : constraint; m_appliedRambdaDt is updated per point
+	//	  linVel*, angVel* : body velocities, modified in place
+	//	  maxRambdaDt, minRambdaDt : per-point clamp limits for the accumulated impulse
+	template<bool JACOBI>
+	static
+	__inline
+	void solveContact(Constraint4& cs, 
+		const MYF4& posA, MYF4& linVelA, MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA,
+		const MYF4& posB, MYF4& linVelB, MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		//	velocity deltas, only used in JACOBI mode
+		MYF4 dLinVelA = MAKE_MYF4(0.f);
+		MYF4 dAngVelA = MAKE_MYF4(0.f);
+		MYF4 dLinVelB = MAKE_MYF4(0.f);
+		MYF4 dAngVelB = MAKE_MYF4(0.f);
+
+		for(int ic=0; ic<4; ic++)
+		{
+			//	skip unused points; a zero coefficient would zero rambdaDt below anyway
+			if( cs.m_jacCoeffInv[ic] == 0.f ) continue;
+
+			{
+				MYF4 angular0, angular1, linear;
+				MYF4 r0 = cs.m_worldPos[ic] - posA;
+				MYF4 r1 = cs.m_worldPos[ic] - posB;
+				//	cs.m_linear holds -normal, so -cs.m_linear is the contact normal
+				setLinearAndAngular( -cs.m_linear, r0, r1, linear, angular0, angular1 );
+
+				float rambdaDt = calcRelVel(cs.m_linear, -cs.m_linear, angular0, angular1,
+					linVelA, angVelA, linVelB, angVelB ) + cs.m_b[ic];
+				rambdaDt *= cs.m_jacCoeffInv[ic];
+
+				{
+					//	clamp the accumulated impulse, then apply only the difference
+					float prevSum = cs.m_appliedRambdaDt[ic];
+					float updated = prevSum;
+					updated += rambdaDt;
+					updated = max2( updated, minRambdaDt[ic] );
+					updated = min2( updated, maxRambdaDt[ic] );
+					rambdaDt = updated - prevSum;
+					cs.m_appliedRambdaDt[ic] = updated;
+				}
+
+				MYF4 linImp0 = invMassA*linear*rambdaDt;
+				MYF4 linImp1 = invMassB*(-linear)*rambdaDt;
+				MYF4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+				MYF4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+
+				if( JACOBI )
+				{
+					dLinVelA += linImp0;
+					dAngVelA += angImp0;
+					dLinVelB += linImp1;
+					dAngVelB += angImp1;
+				}
+				else
+				{
+					linVelA += linImp0;
+					angVelA += angImp0;
+					linVelB += linImp1;
+					angVelB += angImp1;
+				}
+			}
+		}
+
+		//	JACOBI mode: apply the summed deltas once at the end
+		if( JACOBI )
+		{
+			linVelA += dLinVelA;
+			angVelA += dAngVelA;
+			linVelB += dLinVelB;
+			angVelB += dAngVelB;
+		}
+	}
+
+	//	local alias of SolverBase::N_SPLIT, used to size the per-cell arrays below
+	enum
+	{
+		N_SPLIT = SolverBase::N_SPLIT,
+	};
+
+	//	for parallel solve
+	//	one entry per cell of the N_SPLIT x N_SPLIT grid
+	struct ParallelSolveData
+	{
+		u32 m_n[N_SPLIT*N_SPLIT];		//	per-cell count
+		u32 m_offset[N_SPLIT*N_SPLIT];	//	per-cell offset (filled from a prefix scan of m_n in sortContacts2)
+	};
+
+	//	Assigns a batch index to each of the n contacts in cs and reorders cs by it.
+	//	  cs        : contacts, modified in place (batch index set, array reordered)
+	//	  n         : number of contacts; n <= 0 returns 0 without touching cs
+	//	  ignoreIdx : body index that never counts as a conflict
+	//	  simdWidth : after this many contacts are accepted in a pass the conflict
+	//	              flags are cleared; the default -1 never triggers
+	//	Returns the number of batches created.
+	//	Conflicts are tracked in a 256-bit flag set indexed by the low 8 bits of the
+	//	body index, so two different bodies can share a flag.
+	static
+	__inline
+	int sortConstraintByBatch(Contact4* cs, int n, int ignoreIdx, int simdWidth = -1)
+	{
+		//	nothing to batch; also avoids allocating with a non-positive size
+		if( n <= 0 ) return 0;
+
+		SortData* sortData;
+		{
+			BT_PROFILE("new");
+			sortData = new SortData[n];
+		}
+
+		//	source and destination share one buffer: rejected indices are compacted
+		//	in place, the write position never passes the read position
+		u32* idxBuffer = new u32[n];
+		u32* idxSrc = idxBuffer;
+		u32* idxDst = idxBuffer;
+		int nIdxSrc, nIdxDst;
+
+		const int N_FLG = 256;
+		const int FLG_MASK = N_FLG-1;
+		u32 flg[N_FLG/32];
+#if defined(_DEBUG)
+		for(int i=0; i<n; i++) cs[i].getBatchIdx() = -1; 
+#endif
+		for(int i=0; i<n; i++) idxSrc[i] = i;
+		nIdxSrc = n;
+
+		int batchIdx = 0;
+
+		{
+			BT_PROFILE("batching");
+			//	one pass per batch; contacts that conflict are deferred to the next pass
+			while( nIdxSrc )
+			{
+				nIdxDst = 0;
+				int nCurrentBatch = 0;
+
+				//	clear flag
+				for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+
+				for(int i=0; i<nIdxSrc; i++)
+				{
+					int idx = idxSrc[i];
+					ADLASSERT( idx < n );
+					//	check if it can go
+					int aIdx = cs[idx].m_bodyAPtr & FLG_MASK;
+					int bIdx = cs[idx].m_bodyBPtr & FLG_MASK;
+
+					//	unsigned literal: the bit index can be 31
+					const u32 aBit = 1u<<(aIdx&31);
+					const u32 bBit = 1u<<(bIdx&31);
+
+					u32 aUnavailable = flg[ aIdx/32 ] & aBit;
+					u32 bUnavailable = flg[ bIdx/32 ] & bBit;
+
+					aUnavailable = (ignoreIdx==cs[idx].m_bodyAPtr)? 0:aUnavailable;
+					bUnavailable = (ignoreIdx==cs[idx].m_bodyBPtr)? 0:bUnavailable;
+
+					if( aUnavailable==0 && bUnavailable==0 ) // ok 
+					{
+						flg[ aIdx/32 ] |= aBit;
+						flg[ bIdx/32 ] |= bBit;
+						cs[idx].getBatchIdx() = batchIdx;
+						sortData[idx].m_key = batchIdx;
+						sortData[idx].m_value = idx;
+
+						{
+							nCurrentBatch++;
+							if( nCurrentBatch == simdWidth )
+							{
+								nCurrentBatch = 0;
+								for(int f=0; f<N_FLG/32; f++) flg[f] = 0;
+							}
+						}
+					}
+					else
+					{
+						idxDst[nIdxDst++] = idx;
+					}
+				}
+				swap2( idxSrc, idxDst );
+				swap2( nIdxSrc, nIdxDst );
+				batchIdx ++;
+			}
+		}
+
+		
+
+		{
+			BT_PROFILE("radix sort data");
+			//	sort SortData
+			Device::Config cfg;
+			Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, cfg );
+			{
+				Buffer<SortData> sortBuffer; sortBuffer.setRawPtr( deviceHost, sortData, n );
+				RadixSort<TYPE_HOST>::Data* sort = RadixSort<TYPE_HOST>::allocate( deviceHost, n );
+
+				RadixSort<TYPE_HOST>::execute( sort, sortBuffer, n );
+
+				RadixSort<TYPE_HOST>::deallocate( sort );
+			}
+			DeviceUtils::deallocate( deviceHost );
+		}
+
+		{	
+				BT_PROFILE("reorder");
+			//	reorder
+			//	gather from a copy so the in-place writes cannot clobber unread entries
+			Contact4* old = new Contact4[n];
+			memcpy( old, cs, sizeof(Contact4)*n);
+			for(int i=0; i<n; i++)
+			{
+				int idx = sortData[i].m_value;
+				cs[i] = old[idx];
+			}
+			delete [] old;
+		}
+
+		{
+			BT_PROFILE("delete");
+			delete [] idxBuffer;
+			delete [] sortData;
+		}
+#if defined(_DEBUG)
+//		debugPrintf( "nBatches: %d\n", batchIdx );
+		for(int i=0; i<n; i++) ADLASSERT( cs[i].getBatchIdx() != -1 );
+#endif
+		return batchIdx;
+	}
+};
+
+
+
+enum
+{
+//	N_SPLIT = SOLVER_N_SPLIT,
+//	MAX_TASKS_PER_BATCH = N_SPLIT*N_SPLIT/4,
+};
+
+//	Solves the constraints [start, start+nConstraints) one after another on the host.
+//	With m_solveFriction clear it runs the contact rows, otherwise the friction rows.
+//	The buffers are held as const Buffer pointers but are accessed (and written)
+//	through HostBuffer references in run().
+struct SolveTask// : public ThreadPool::Task
+{
+	SolveTask(const Buffer<RigidBodyBase::Body>* bodies, const Buffer<RigidBodyBase::Inertia>* shapes, const Buffer<Constraint4>* constraints,
+		int start, int nConstraints)
+		: m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ),
+		m_solveFriction( true ){}
+
+	u16 getType(){ return 0; }
+
+	//	tIdx is not used
+	void run(int tIdx)
+	{
+		HostBuffer<RigidBodyBase::Body>& bodies = *(HostBuffer<RigidBodyBase::Body>*)m_bodies;
+		HostBuffer<RigidBodyBase::Inertia>& shapes = *(HostBuffer<RigidBodyBase::Inertia>*)m_shapes;
+		HostBuffer<Constraint4>& constraints = *(HostBuffer<Constraint4>*)m_constraints;
+
+		const int end = m_start + m_nConstraints;
+		for(int i=m_start; i<end; i++)
+		{
+			float frictionCoeff = constraints[i].getFrictionCoeff();
+			const int aIdx = (int)constraints[i].m_bodyA;
+			const int bIdx = (int)constraints[i].m_bodyB;
+			RigidBodyBase::Body& bodyA = bodies[aIdx];
+			RigidBodyBase::Body& bodyB = bodies[bIdx];
+
+			//	default limits: impulses are clamped to [0, FLT_MAX]
+			float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+			float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+
+			if( m_solveFriction )
+			{
+				//	friction limits scale with the summed normal impulse
+				float sum = 0;
+				for(int j=0; j<4; j++)
+				{
+					sum +=constraints[i].m_appliedRambdaDt[j];
+				}
+
+				//	NOTE(review): this overrides the coefficient read from the constraint above
+				frictionCoeff = 0.7f;
+				for(int j=0; j<4; j++)
+				{
+					maxRambdaDt[j] = frictionCoeff*sum;
+					minRambdaDt[j] = -maxRambdaDt[j];
+				}
+
+				SolverInl::solveFriction( constraints[i], bodyA.m_pos, (MYF4&)bodyA.m_linVel, (MYF4&)bodyA.m_angVel, bodyA.m_invMass, shapes[aIdx].m_invInertia, 
+					bodyB.m_pos, (MYF4&)bodyB.m_linVel, (MYF4&)bodyB.m_angVel, bodyB.m_invMass, shapes[bIdx].m_invInertia,
+					maxRambdaDt, minRambdaDt );
+			}
+			else
+			{
+				SolverInl::solveContact<false>( constraints[i], bodyA.m_pos, (MYF4&)bodyA.m_linVel, (MYF4&)bodyA.m_angVel, bodyA.m_invMass, shapes[aIdx].m_invInertia, 
+					bodyB.m_pos, (MYF4&)bodyB.m_linVel, (MYF4&)bodyB.m_angVel, bodyB.m_invMass, shapes[bIdx].m_invInertia,
+					maxRambdaDt, minRambdaDt );
+			}
+		}
+	}
+
+	const Buffer<RigidBodyBase::Body>* m_bodies;
+	const Buffer<RigidBodyBase::Inertia>* m_shapes;
+	const Buffer<Constraint4>* m_constraints;
+	int m_start;			//	index of the first constraint to solve
+	int m_nConstraints;		//	number of constraints to solve
+	bool m_solveFriction;	//	true: friction rows, false: contact rows
+};
+
+
+//	Creates the host solver data: stores the device and starts with no
+//	parallel-solve scratch data. pairCapacity is not used on the host.
+//	Declared inline rather than static: a storage class specifier is not allowed
+//	on an explicit specialization, and inline keeps this header-defined function
+//	safe to include from several translation units.
+template<>
+inline Solver<adl::TYPE_HOST>::Data* Solver<adl::TYPE_HOST>::allocate( const Device* device, int pairCapacity )
+{
+	Solver<adl::TYPE_HOST>::Data* data = new Data;
+	data->m_device = device;
+	data->m_parallelSolveData = 0;
+
+	return data;
+}
+
+//	Releases the host solver data together with its parallel-solve scratch data.
+//	Declared inline rather than static: a storage class specifier is not allowed
+//	on an explicit specialization.
+template<>
+inline void Solver<adl::TYPE_HOST>::deallocate( Solver<TYPE_HOST>::Data* data )
+{
+	//	stored as an untyped pointer, so cast back before deleting; deleting a null pointer is a no-op
+	delete (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+	delete data;
+}
+
+
+//	Host path: groups the contacts by grid cell in preparation for the parallel solve.
+//	When cfg.m_enableParallelSolve is set it
+//	  2. computes a cell key per contact from the x/z position of one of its bodies
+//	     (body B when body A is cfg.m_staticIdx, body A otherwise),
+//	  3. radix-sorts the (key, contact index) pairs,
+//	  4. fills the per-cell counts and offsets in data->m_parallelSolveData,
+//	  5. reorders the contacts in place to follow the sorted order.
+//	The ParallelSolveData is created on first use and reused afterwards, so calling
+//	this repeatedly on the same `data` does not leak. additionalData is not used.
+void sortContacts2(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+	HostBuffer<RigidBodyBase::Body>* bodyNative 
+		= (HostBuffer<RigidBodyBase::Body>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	HostBuffer<Contact4>* contactNative 
+		= (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contactsIn);
+
+	if( cfg.m_enableParallelSolve )
+	{
+		//	reuse the scratch data from an earlier call instead of overwriting the pointer
+		if( data->m_parallelSolveData == 0 )
+		{
+			data->m_parallelSolveData = new SolverInl::ParallelSolveData;
+		}
+		SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		HostBuffer<SortData> sortData( data->m_device, nContacts );
+		{	//	2. set cell idx
+			float spacing = adl::SolverBase::N_OBJ_PER_SPLIT*cfg.m_averageExtent;
+			float xScale = 1.f/spacing;
+			for(int i=0; i<nContacts; i++)
+			{
+				int idx = ((*contactNative)[i].m_bodyAPtr==cfg.m_staticIdx)? (*contactNative)[i].m_bodyBPtr:(*contactNative)[i].m_bodyAPtr;
+				float4& p = (*bodyNative)[idx].m_pos;
+				//	negative coordinates are shifted by one before scaling; the mask
+				//	wraps the result into [0, N_SPLIT)
+				int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*xScale)&(adl::SolverBase::N_SPLIT-1);
+				int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*xScale)&(adl::SolverBase::N_SPLIT-1);
+				ADLASSERT( xIdx >= 0 && xIdx < adl::SolverBase::N_SPLIT );
+				ADLASSERT( zIdx >= 0 && zIdx < adl::SolverBase::N_SPLIT );
+				sortData[i].m_key = (xIdx+zIdx*adl::SolverBase::N_SPLIT);
+				sortData[i].m_value = i;
+			}
+		}
+
+		{	//	3. sort by cell idx
+			RadixSort<TYPE_HOST>::Data* sData = RadixSort<TYPE_HOST>::allocate( data->m_device, nContacts );
+
+			RadixSort<TYPE_HOST>::execute( sData, sortData, nContacts );
+
+			RadixSort<TYPE_HOST>::deallocate( sData );
+		}
+
+		{	//	4. find entries
+			//	wrap the arrays inside solveData so the results land there directly
+			HostBuffer<u32> counts; counts.setRawPtr( data->m_device, solveData->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+			HostBuffer<u32> offsets; offsets.setRawPtr( data->m_device, solveData->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+			{
+				BoundSearch<TYPE_HOST>::Data* sData = BoundSearch<TYPE_HOST>::allocate( data->m_device );
+				PrefixScan<TYPE_HOST>::Data* pData = PrefixScan<TYPE_HOST>::allocate( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+
+				BoundSearch<TYPE_HOST>::execute( sData, sortData, nContacts, counts, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT, BoundSearchBase::COUNT );
+
+				PrefixScan<TYPE_HOST>::execute( pData, counts, offsets, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				
+				BoundSearch<TYPE_HOST>::deallocate( sData );
+				PrefixScan<TYPE_HOST>::deallocate( pData );
+			}
+#if defined(_DEBUG)
+			{
+				//	debug only: recompute counts and offsets naively and compare
+				HostBuffer<u32> n0( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				HostBuffer<u32> offset0( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					n0[i] = 0;
+					offset0[i] = 0;
+				}
+
+				for(int i=0; i<nContacts; i++)
+				{
+					int idx = sortData[i].m_key;
+					n0[idx]++;
+				}
+
+				//	scan
+				int sum = 0;
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					offset0[i] = sum;
+					sum += n0[i];
+				}
+
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					ADLASSERT( n0[i] == counts[i] );
+					ADLASSERT( offset0[i] == offsets[i] );
+				}
+			}
+#endif
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	gather from a copy so the in-place writes cannot clobber unread entries
+			Contact4* old = new Contact4[nContacts];
+			memcpy( old, contactNative->m_ptr, sizeof(Contact4)*nContacts );
+			for(int i=0; i<nContacts; i++)
+			{
+				int srcIdx = sortData[i].m_value;
+				(*contactNative)[i] = old[srcIdx];
+			}
+			delete [] old;
+		}
+	}
+
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<true>( contactNative, contactsIn );
+}
+
+//	Host path: sorts the contacts by cell (sortContacts2), batches them
+//	(Solver<TYPE_HOST>::batchContacts) and converts them to constraints
+//	(Solver<TYPE_HOST>::convertToConstraints).
+//	The batching step reads the per-cell counts and offsets that sortContacts2
+//	stores in data->m_parallelSolveData; that data only exists when
+//	cfg.m_enableParallelSolve is set, hence the assert below.
+static void reorderConvertToConstraints2( Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	adl::Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	sortContacts2( data, bodyBuf, contactsIn, additionalData, nContacts, cfg );
+
+	{
+		SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+		//	dereferenced right below; catch a missing allocation here instead
+		ADLASSERT( solveData != 0 );
+
+		//	wrap the raw per-cell arrays so they can be passed as buffers
+		Buffer<u32> n; n.setRawPtr( data->m_device, solveData->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+		Buffer<u32> offsets; offsets.setRawPtr( data->m_device, solveData->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+		Solver<TYPE_HOST>::batchContacts( data, contactsIn, nContacts, &n, &offsets, cfg.m_staticIdx );
+	}
+	
+	Solver<TYPE_HOST>::convertToConstraints( data, bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg );
+}
+
+//	Host path: solves the first n constraints sequentially.
+//	Runs data->m_nIterations passes over the contact rows, then the same number
+//	of passes over the friction rows, each pass as a single SolveTask.
+//	The template parameter and additionalData are not used.
+template<DeviceType TYPE>
+static void solveContactConstraint(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	const Buffer<Constraint4>* constraintBuf = (const Buffer<Constraint4>*)constraint;
+
+	Buffer<RigidBodyBase::Body>* hostBodies
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	Buffer<RigidBodyBase::Inertia>* hostShapes
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, shapeBuf );
+	Buffer<Constraint4>* hostConstraints
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, constraintBuf );
+
+	//	phase 0 solves contact rows, phase 1 solves friction rows
+	for(int phase=0; phase<2; phase++)
+	{
+		const bool frictionPhase = (phase == 1);
+		for(int iter=0; iter<data->m_nIterations; iter++)
+		{
+			SolveTask task( hostBodies, hostShapes, hostConstraints, 0, n );
+			task.m_solveFriction = frictionPhase;
+			task.run(0);
+		}
+	}
+
+	//	only the body buffer is unmapped with the <true> flag
+	BufferUtils::unmap<true>( hostBodies, bodyBuf );
+	BufferUtils::unmap<false>( hostShapes, shapeBuf );
+	BufferUtils::unmap<false>( hostConstraints, constraintBuf );
+}
+
+#if 0
+static
+int createSolveTasks( int batchIdx, Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, int n, ThreadPool::Task* tasksOut[], int taskCapacity )
+{
+/*
+	ADLASSERT( (N_SPLIT&1) == 0 );
+	ADLASSERT( batchIdx < N_BATCHES );
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+	ADLASSERT( data->m_parallelSolveData );
+
+	SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+	data->m_batchIdx = 0;
+
+	const int nx = N_SPLIT/2;
+
+	int nTasksCreated = 0;
+
+//	for(int ii=0; ii<2; ii++)
+	for(batchIdx=0; batchIdx<4; batchIdx++)
+	{
+		int2 offset = make_int2( batchIdx&1, batchIdx>>1 );
+		for(int ix=0; ix<nx; ix++) for(int iy=0; iy<nx; iy++)
+		{
+			int xIdx = ix*2 + offset.x;
+			int yIdx = iy*2 + offset.y;
+			int cellIdx = xIdx+yIdx*N_SPLIT;
+
+			int n = solveData->m_n[cellIdx];
+			int start = solveData->m_offset[cellIdx];
+
+			if( n == 0 ) continue;
+
+			SolveTask* task = new SolveTask( bodyBuf, shapeBuf, (const Buffer<Constraint4>*)constraint, start, n );
+//			task->m_solveFriction = (ii==0)? false:true;
+			tasksOut[nTasksCreated++] = task;
+		}
+	}
+
+	return nTasksCreated;
+*/
+	ADLASSERT(0);
+	return 0;
+}
+#endif
+
+
+
+//	Host path: builds one Constraint4 per valid contact with SolverInl::setConstraint4.
+//	A body whose linear and angular velocity are both zero is passed on with zero
+//	inverse mass and zero inverse inertia. Invalid contacts are skipped, which
+//	leaves their constraint slot untouched. The loop runs under OpenMP in
+//	non-debug builds. additionalData is not used.
+static void convertToConstraints2(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+
+	HostBuffer<RigidBodyBase::Body>* hostBodies 
+		= (HostBuffer<RigidBodyBase::Body>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	HostBuffer<RigidBodyBase::Inertia>* hostShapes 
+		= (HostBuffer<RigidBodyBase::Inertia>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, shapeBuf );
+	HostBuffer<Contact4>* hostContacts 
+		= (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contactsIn );
+	HostBuffer<Constraint4>* hostConstraints 
+		= (HostBuffer<Constraint4>*)BufferUtils::map<TYPE_HOST, false>( data->m_device, (Buffer<Constraint4>*)contactCOut );
+
+	{
+#if !defined(_DEBUG)
+#pragma omp parallel for
+#endif
+		for(int i=0; i<nContacts; i++)
+		{
+			Contact4& contact = (*hostContacts)[i];
+
+			if( contact.isInvalid() ) continue;
+
+			const int aIdx = (int)contact.m_bodyAPtr;
+			const int bIdx = (int)contact.m_bodyBPtr;
+
+			const RigidBodyBase::Body& bodyA = (*hostBodies)[aIdx];
+			const RigidBodyBase::Body& bodyB = (*hostBodies)[bIdx];
+
+			//	local copies of the state handed to setConstraint4
+			MYF4 posA( bodyA.m_pos );
+			MYF4 linVelA( bodyA.m_linVel );
+			MYF4 angVelA( bodyA.m_angVel );
+			MYF4 posB( bodyB.m_pos );
+			MYF4 linVelB( bodyB.m_linVel );
+			MYF4 angVelB( bodyB.m_angVel );
+
+			//	a body at rest is passed on with zero inverse mass and inertia
+			const bool aIsInactive = ( isZero( linVelA ) && isZero( angVelA ) );
+			const bool bIsInactive = ( isZero( linVelB ) && isZero( angVelB ) );
+
+			Constraint4& dst = (*hostConstraints)[i];
+
+			SolverInl::setConstraint4( posA, linVelA, angVelA, 
+				(aIsInactive)? 0.f : bodyA.m_invMass, (aIsInactive)? mtZero() : (*hostShapes)[aIdx].m_invInertia,
+				posB, linVelB, angVelB, 
+				(bIsInactive)? 0.f : bodyB.m_invMass, (bIsInactive)? mtZero() : (*hostShapes)[bIdx].m_invInertia, 
+				contact, cfg, 
+				dst );
+			dst.m_batchIdx = contact.getBatchIdx();
+		}
+	}
+
+	//	only the constraint buffer is unmapped with the <true> flag
+	BufferUtils::unmap<false>( hostBodies, bodyBuf );
+	BufferUtils::unmap<false>( hostShapes, shapeBuf );
+	BufferUtils::unmap<false>( hostContacts, contactsIn );
+	BufferUtils::unmap<true>( hostConstraints, (Buffer<Constraint4>*)contactCOut );
+}
+
+
+
+
+
+// Sorts the contacts of every grid cell into solver batches on the host.
+//
+// data      - solver data; its device must be a host device (asserted).
+// contacts  - contact buffer; nContacts entries are mapped, sorted in place
+//             per cell and unmapped with unmap<true>
+//             (presumably "copy back" - confirm against BufferUtils).
+// nContacts - number of contacts mapped from `contacts`.
+// n         - per-cell contact counts; N_SPLIT*N_SPLIT entries are read.
+// offsets   - per-cell start index into `contacts`, same number of entries.
+// staticIdx - forwarded unchanged to SolverInl::sortConstraintByBatch.
+//
+// Side effect: prints the largest batch count found in any cell to stdout.
+static void batchContacts2(  Solver<TYPE_HOST>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+
+	HostBuffer<Contact4>* contactNative =0;
+	HostBuffer<u32>* nNative =0;
+	HostBuffer<u32>* offsetsNative =0;
+
+	{
+		BT_PROFILE("BufferUtils::map");
+		contactNative  = (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contacts, nContacts );
+	}
+	{
+		BT_PROFILE("BufferUtils::map2");
+		nNative = (HostBuffer<u32>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, n );
+		offsetsNative= (HostBuffer<u32>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, offsets );
+	}
+
+	{
+		BT_PROFILE("sortConstraintByBatch");
+		int maxNumBatches = 0;
+
+		const int numCells = adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT;
+		for(int i=0; i<numCells; i++)
+		{
+			//	named so they do not shadow the `n` / `offsets` parameters
+			const int cellCount = (*nNative)[i];
+			const int cellOffset = (*offsetsNative)[i];
+
+			//	empty cells have nothing to sort
+			if( cellCount ) 
+			{
+				int numBatches = SolverInl::sortConstraintByBatch( contactNative->m_ptr+cellOffset, cellCount, staticIdx,-1 );	//	on GPU
+				maxNumBatches = max(numBatches,maxNumBatches);
+
+	//			SolverInl::sortConstraintByBatch( contactNative->m_ptr+cellOffset, cellCount, staticIdx );	//	on CPU
+			}
+		}
+
+		printf("maxNumBatches = %d\n", maxNumBatches);
+	}
+
+	{
+		BT_PROFILE("BufferUtils::unmap");
+		BufferUtils::unmap<true>( contactNative, contacts, nContacts );
+	}
+	{
+		BT_PROFILE("BufferUtils::unmap2");
+		BufferUtils::unmap<false>( nNative, n );
+		BufferUtils::unmap<false>( offsetsNative, offsets );
+	}
+
+
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.cl
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.h
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.cl
@@ -0,0 +1,338 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+#define counter32_t volatile __global int*
+#endif
+
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+
+#define max2 max
+#define min2 min
+
+
+#define WG_SIZE 64
+
+
+
+//	Contact record as read and written by CreateBatches in this file.
+//	NOTE(review): presumably has to match the host-side Contact4 layout - confirm.
+typedef struct 
+{
+	float4 m_worldPos[4];
+	float4 m_worldNormal;
+	u32 m_coeffs;
+	int m_batchIdx;	//	written by CreateBatches with the batch the contact was assigned to
+
+	u32 m_bodyA;	//	first body id, read by CreateBatches
+	u32 m_bodyB;	//	second body id, read by CreateBatches
+}Contact4;
+
+//	By-value kernel argument of CreateBatches.
+//	In this file the kernel only reads m_staticIdx; the per-group count and
+//	start come from the gN / gStart buffers, not from m_n / m_start.
+typedef struct 
+{
+	int m_n;
+	int m_start;
+	int m_staticIdx;
+	int m_paddings[1];
+} ConstBuffer;
+
+//	One pending contact in the local-memory ring of CreateBatches.
+typedef struct 
+{
+	u32 m_a;	//	smaller of the contact's two body ids
+	u32 m_b;	//	larger of the contact's two body ids
+	u32 m_idx;	//	contact index relative to the work-group's start offset
+}Elem;
+
+#define STACK_SIZE (WG_SIZE*10)
+//#define STACK_SIZE (WG_SIZE)
+#define RING_SIZE 1024
+#define RING_SIZE_MASK (RING_SIZE-1)
+#define CHECK_SIZE (WG_SIZE)
+
+
+#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
+#define RING_END ldsTmp
+
+//	Tests the bit belonging to idx in the bit set buff.
+//	idx is first wrapped into [0, 32*CHECK_SIZE), so different ids can share a bit.
+//	Returns the masked word: non-zero if the bit is set, 0 otherwise (not normalised to 1).
+u32 readBuf(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	return buff[word] & (1<<bit);
+}
+
+//	Sets the bit belonging to idx (wrapped into [0, 32*CHECK_SIZE)) in the bit set buff.
+//	Uses an atomic OR instead of a plain read-modify-write of the word.
+void writeBuf(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	atom_or( &buff[word], (1<<bit) );
+}
+
+//	Atomically sets the bit belonging to idx (wrapped into [0, 32*CHECK_SIZE)) in buff.
+//	Returns 1 if the bit was clear before this call (the caller claimed it), 0 if it was already set.
+u32 tryWrite(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	//	atom_or hands back the word as it was before the OR
+	const u32 before = (u32)atom_or( &buff[word], (1<<bit) );
+	return !((before >> bit)&1);
+}
+
+//	batching on the GPU
+//
+//	One work-group processes one cell: it reads gN[group] contacts starting at
+//	gStart[group] from gConstraints and writes each of them once to
+//	gConstraintsOut (starting at the same offset) with m_batchIdx filled in.
+//
+//	Per outer iteration `ie` (at most 250):
+//	  1. contacts are pulled from global memory into a local ring (ldsRingElem),
+//	  2. every ring entry whose two bodies are not yet marked in ldsFixedBuffer
+//	     and which wins both bits in ldsCheckBuffer is pushed onto a local
+//	     stack; the rest is compacted back to the front of the ring,
+//	  3. the stack is written out with m_batchIdx = ie.
+//	If an iteration accepts nothing, the remaining ring entries are written
+//	out with m_batchIdx = 100+i.
+//
+//	Body ids are hashed into the bit sets modulo 32*CHECK_SIZE, so two
+//	different bodies may be treated as the same one.
+//
+//	cb - only m_staticIdx is read, and its only users are commented out.
+//
+//	NOTE(review): a mid-iteration stack flush clears ldsFixedBuffer but keeps
+//	the same `ie`, so contacts written before and after that flush can share a
+//	body while carrying the same batch index - confirm this is acceptable.
+//	NOTE(review): the fallback index 100+i can coincide with a regular batch
+//	index once ie reaches 100 - confirm against the consumer of m_batchIdx.
+//	NOTE(review): several `lIdx == 0` writes inside the loops are not followed
+//	by a barrier before other work-items use the value; this looks like it
+//	depends on how the work-group is scheduled - confirm on the target devices.
+__kernel void CreateBatches( __global Contact4* gConstraints, __global Contact4* gConstraintsOut,
+		__global u32* gN, __global u32* gStart, 
+		ConstBuffer cb )
+{
+	__local u32 ldsStackIdx[STACK_SIZE];	//	contact indices accepted for the current batch
+	__local u32 ldsStackEnd;				//	number of valid entries in ldsStackIdx
+	__local Elem ldsRingElem[RING_SIZE];	//	contacts waiting to be assigned
+	__local u32 ldsRingEnd;					//	number of valid entries in ldsRingElem
+	__local u32 ldsTmp;						//	used as RING_END: write cursor while compacting the ring
+	__local u32 ldsCheckBuffer[CHECK_SIZE];	//	bit set: bodies claimed in the current pass
+	__local u32 ldsFixedBuffer[CHECK_SIZE];	//	bit set: bodies already used since the last clear
+	__local u32 ldsGEnd;					//	next contact (relative index) to fetch from global memory
+	__local u32 ldsDstEnd;					//	next free slot in gConstraintsOut
+
+	int wgIdx = GET_GROUP_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	
+	const int m_n = gN[wgIdx];
+	const int m_start = gStart[wgIdx];
+	const int m_staticIdx = cb.m_staticIdx;
+		
+	if( lIdx == 0 )
+	{
+		ldsRingEnd = 0;
+		ldsGEnd = 0;
+		ldsStackEnd = 0;
+		ldsDstEnd = m_start;
+	}
+	//	the counters above are written by work-item 0 only; make them visible
+	//	to the whole group before anybody reads them
+	GROUP_LDS_BARRIER;
+	
+//	while(1)
+	for(int ie=0; ie<250; ie++)
+	{
+		//	start the batch with no body marked as used
+		for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsFixedBuffer[i] = 0;
+
+		for(int giter=0; giter<4; giter++)
+		{
+			int ringCap = GET_RING_CAPACITY;
+		
+			//	1. fill ring
+			if( ldsGEnd < m_n )
+			{
+				while( ringCap > WG_SIZE )
+				{
+					if( ldsGEnd >= m_n ) break;
+					if( lIdx < ringCap - WG_SIZE )
+					{
+						int srcIdx;
+						AtomInc1( ldsGEnd, srcIdx );
+						if( srcIdx < m_n )
+						{
+							int dstIdx;
+							AtomInc1( ldsRingEnd, dstIdx );
+							
+							//	store the body pair ordered (smaller id first)
+							int a = gConstraints[m_start+srcIdx].m_bodyA;
+							int b = gConstraints[m_start+srcIdx].m_bodyB;
+							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
+							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
+							ldsRingElem[dstIdx].m_idx = srcIdx;
+						}
+					}
+					ringCap = GET_RING_CAPACITY;
+				}
+			}
+
+			GROUP_LDS_BARRIER;
+	
+			//	2. fill stack
+			//	rejected entries are written back to the front of the same ring
+			__local Elem* dst = ldsRingElem;
+			if( lIdx == 0 ) RING_END = 0;
+
+			int srcIdx=lIdx;
+			int end = ldsRingEnd;
+
+			{
+				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
+				{
+					Elem e;
+					if(srcIdx<end) e = ldsRingElem[srcIdx];
+					//	work-items past the end of the ring have nothing to do
+					bool done = (srcIdx<end)?false:true;
+
+					//	fresh claim set for this pass (index with i so it stays correct if CHECK_SIZE != WG_SIZE)
+					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[i] = 0;
+					
+					if( !done )
+					{
+						int aUsed = readBuf( ldsFixedBuffer, e.m_a);
+						int bUsed = readBuf( ldsFixedBuffer, e.m_b);
+
+						if( aUsed==0 && bUsed==0 )
+						{
+							int aAvailable;
+							int bAvailable;
+
+							//	claim both bodies; only the first claimant of a bit gets 1
+							aAvailable = tryWrite( ldsCheckBuffer, e.m_a );
+							bAvailable = tryWrite( ldsCheckBuffer, e.m_b );
+
+							//aAvailable = (m_staticIdx == e.m_a)? 1: aAvailable;
+							//bAvailable = (m_staticIdx == e.m_b)? 1: bAvailable;
+
+							bool success = (aAvailable && bAvailable);
+							if(success)
+							{
+								writeBuf( ldsFixedBuffer, e.m_a );
+								writeBuf( ldsFixedBuffer, e.m_b );
+							}
+							done = success;
+						}
+					}
+
+					//	put it aside
+					if(srcIdx<end)
+					{
+						if( done )
+						{
+							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
+							if( dstIdx < STACK_SIZE )
+								ldsStackIdx[dstIdx] = e.m_idx;
+							else{
+								//	stack overflow: undo the increment and keep the entry in the ring
+								done = false;
+								AtomAdd( ldsStackEnd, -1 );
+							}
+						}
+						if( !done )
+						{
+							int dstIdx; AtomInc1( RING_END, dstIdx );
+							dst[dstIdx] = e;
+						}
+					}
+
+					//	if filled, flush
+					if( ldsStackEnd == STACK_SIZE )
+					{
+						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
+						{
+							int idx = m_start + ldsStackIdx[i];
+							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+							gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+						}
+						if( lIdx == 0 ) ldsStackEnd = 0;
+
+						for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) 
+							ldsFixedBuffer[i] = 0;
+					}
+				}
+			}
+
+			//	the compacted leftovers become the new ring content
+			if( lIdx == 0 ) ldsRingEnd = RING_END;
+		}
+
+		GROUP_LDS_BARRIER;
+
+		//	write out what was accepted for batch ie
+		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
+		{
+			int idx = m_start + ldsStackIdx[i];
+			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+			gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+		}
+
+		//	in case it couldn't consume any pair. Flush them
+		//	todo. Serial batch worth while?
+		if( ldsStackEnd == 0 )
+		{
+			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
+			{
+				int idx = m_start + ldsRingElem[i].m_idx;
+				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
+			}
+			GROUP_LDS_BARRIER;
+			if( lIdx == 0 ) ldsRingEnd = 0;
+		}
+
+		if( lIdx == 0 ) ldsStackEnd = 0;
+
+		GROUP_LDS_BARRIER;
+
+		//	termination
+		//	all contacts fetched and none left waiting in the ring
+		if( ldsGEnd == m_n && ldsRingEnd == 0 )
+			break;
+	}
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.h
@@ -0,0 +1,371 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+static const char* batchingKernelsCL= \
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"#define counter32_t volatile __global int*\n"
+"#endif\n"
+"\n"
+"\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"\n"
+"\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"\n"
+"\n"
+"#define WG_SIZE 64\n"
+"\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_worldNormal;\n"
+"	u32 m_coeffs;\n"
+"	int m_batchIdx;\n"
+"\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"}Contact4;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	int m_n;\n"
+"	int m_start;\n"
+"	int m_staticIdx;\n"
+"	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	u32 m_a;\n"
+"	u32 m_b;\n"
+"	u32 m_idx;\n"
+"}Elem;\n"
+"\n"
+"#define STACK_SIZE (WG_SIZE*10)\n"
+"//#define STACK_SIZE (WG_SIZE)\n"
+"#define RING_SIZE 1024\n"
+"#define RING_SIZE_MASK (RING_SIZE-1)\n"
+"#define CHECK_SIZE (WG_SIZE)\n"
+"\n"
+"\n"
+"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
+"#define RING_END ldsTmp\n"
+"\n"
+"u32 readBuf(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	return buff[bufIdx] & (1<<bitIdx);\n"
+"}\n"
+"\n"
+"void writeBuf(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"//	buff[bufIdx] |= (1<<bitIdx);\n"
+"	atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"}\n"
+"\n"
+"u32 tryWrite(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"	return ((ans >> bitIdx)&1) == 0;\n"
+"}\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"\n"
+"	int m_valInt4;\n"
+"	int m_valInt5;\n"
+"	int m_valInt6;\n"
+"	int m_valInt7;\n"
+"\n"
+"	int m_valInt8;\n"
+"	int m_valInt9;\n"
+"	int m_valInt10;\n"
+"	int m_valInt11;\n"
+"	\n"
+"	int	m_valInt12;\n"
+"	int	m_valInt13;\n"
+"	int	m_valInt14;\n"
+"	int	m_valInt15;\n"
+"\n"
+"\n"
+"	float m_fval0;\n"
+"	float m_fval1;\n"
+"	float m_fval2;\n"
+"	float m_fval3;\n"
+"} SolverDebugInfo;\n"
+"\n"
+"//	batching on the GPU\n"
+"__kernel void CreateBatches( __global Contact4* gConstraints, __global Contact4* gConstraintsOut, //__global u32* gRes, \n"
+"		__global u32* gN, __global u32* gStart, \n"
+"//		__global SolverDebugInfo* debugInfo, \n"
+"		ConstBuffer cb )\n"
+"{\n"
+"	__local u32 ldsStackIdx[STACK_SIZE];\n"
+"	__local u32 ldsStackEnd;\n"
+"	__local Elem ldsRingElem[RING_SIZE];\n"
+"	__local u32 ldsRingEnd;\n"
+"	__local u32 ldsTmp;\n"
+"	__local u32 ldsCheckBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsGEnd;\n"
+"	__local u32 ldsDstEnd;\n"
+"\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	\n"
+"	const int m_n = gN[wgIdx];\n"
+"	const int m_start = gStart[wgIdx];\n"
+"	const int m_staticIdx = cb.m_staticIdx;\n"
+"		\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		ldsRingEnd = 0;\n"
+"		ldsGEnd = 0;\n"
+"		ldsStackEnd = 0;\n"
+"		ldsDstEnd = m_start;\n"
+"	}\n"
+"	\n"
+"//	while(1)\n"
+"	for(int ie=0; ie<250; ie++)\n"
+"	{\n"
+"		ldsFixedBuffer[lIdx] = 0;\n"
+"\n"
+"		for(int giter=0; giter<4; giter++)\n"
+"		{\n"
+"			int ringCap = GET_RING_CAPACITY;\n"
+"		\n"
+"			//	1. fill ring\n"
+"			if( ldsGEnd < m_n )\n"
+"			{\n"
+"				while( ringCap > WG_SIZE )\n"
+"				{\n"
+"					if( ldsGEnd >= m_n ) break;\n"
+"					if( lIdx < ringCap - WG_SIZE )\n"
+"					{\n"
+"						int srcIdx;\n"
+"						AtomInc1( ldsGEnd, srcIdx );\n"
+"						if( srcIdx < m_n )\n"
+"						{\n"
+"							int dstIdx;\n"
+"							AtomInc1( ldsRingEnd, dstIdx );\n"
+"							\n"
+"							int a = gConstraints[m_start+srcIdx].m_bodyA;\n"
+"							int b = gConstraints[m_start+srcIdx].m_bodyB;\n"
+"							ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
+"							ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
+"							ldsRingElem[dstIdx].m_idx = srcIdx;\n"
+"						}\n"
+"					}\n"
+"					ringCap = GET_RING_CAPACITY;\n"
+"				}\n"
+"			}\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"	\n"
+"			//	2. fill stack\n"
+"			__local Elem* dst = ldsRingElem;\n"
+"			if( lIdx == 0 ) RING_END = 0;\n"
+"\n"
+"			int srcIdx=lIdx;\n"
+"			int end = ldsRingEnd;\n"
+"\n"
+"			{\n"
+"				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
+"				{\n"
+"					Elem e;\n"
+"					if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
+"					bool done = (srcIdx<end)?false:true;\n"
+"\n"
+"					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
+"					\n"
+"					if( !done )\n"
+"					{\n"
+"						int aUsed = readBuf( ldsFixedBuffer, e.m_a);\n"
+"						int bUsed = readBuf( ldsFixedBuffer, e.m_b);\n"
+"\n"
+"						if( aUsed==0 && bUsed==0 )\n"
+"						{\n"
+"							int aAvailable;\n"
+"							int bAvailable;\n"
+"\n"
+"							aAvailable = tryWrite( ldsCheckBuffer, e.m_a );\n"
+"							bAvailable = tryWrite( ldsCheckBuffer, e.m_b );\n"
+"\n"
+"							//aAvailable = (m_staticIdx == e.m_a)? 1: aAvailable;\n"
+"							//bAvailable = (m_staticIdx == e.m_b)? 1: bAvailable;\n"
+"\n"
+"							bool success = (aAvailable && bAvailable);\n"
+"							if(success)\n"
+"							{\n"
+"								writeBuf( ldsFixedBuffer, e.m_a );\n"
+"								writeBuf( ldsFixedBuffer, e.m_b );\n"
+"							}\n"
+"							done = success;\n"
+"						}\n"
+"					}\n"
+"\n"
+"					//	put it aside\n"
+"					if(srcIdx<end)\n"
+"					{\n"
+"						if( done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
+"							if( dstIdx < STACK_SIZE )\n"
+"								ldsStackIdx[dstIdx] = e.m_idx;\n"
+"							else{\n"
+"								done = false;\n"
+"								AtomAdd( ldsStackEnd, -1 );\n"
+"							}\n"
+"						}\n"
+"						if( !done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( RING_END, dstIdx );\n"
+"							dst[dstIdx] = e;\n"
+"						}\n"
+"					}\n"
+"\n"
+"					//	if filled, flush\n"
+"					if( ldsStackEnd == STACK_SIZE )\n"
+"					{\n"
+"						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
+"						{\n"
+"							int idx = m_start + ldsStackIdx[i];\n"
+"							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"							gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"						}\n"
+"						if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"\n"
+"						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
+"						ldsFixedBuffer[lIdx] = 0;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"\n"
+"			if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
+"		{\n"
+"			int idx = m_start + ldsStackIdx[i];\n"
+"			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"			gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"		}\n"
+"\n"
+"		//	in case it couldn't consume any pair. Flush them\n"
+"		//	todo. Serial batch worth while?\n"
+"		if( ldsStackEnd == 0 )\n"
+"		{\n"
+"			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
+"			{\n"
+"				int idx = m_start + ldsRingElem[i].m_idx;\n"
+"				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
+"			}\n"
+"			GROUP_LDS_BARRIER;\n"
+"			if( lIdx == 0 ) ldsRingEnd = 0;\n"
+"		}\n"
+"\n"
+"		if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		//	termination\n"
+"		if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
+"			break;\n"
+"	}\n"
+"\n"
+"\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+;
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringify.py
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringify.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+import sys
+import os
+import shutil
+
+arg = sys.argv[1]
+fh = open(arg)
+	
+print 'static const char* '+sys.argv[2]+'= \\'
+for line in fh.readlines():
+	a = line.strip('\n')
+	print '"'+a+'\\n"'
+print ';'
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernels.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernels.bat
@@ -0,0 +1,6 @@
+@rem Regenerates ChNarrowphaseKernels.h from ChNarrowphaseKernels.cl: the kernel
+@rem source becomes the C string constant narrowphaseKernelsCL (see stringify.py).
+@rem Presumably relies on .py files being associated with a Python interpreter.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsAll.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsAll.bat
@@ -0,0 +1,10 @@
+@rem Regenerates all three kernel headers in this directory from their .cl
+@rem sources. Each header holds the kernel text as one C string constant
+@rem (narrowphaseKernelsCL, solverKernelsCL, batchingKernelsCL); see stringify.py.
+@rem Presumably relies on .py files being associated with a Python interpreter.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+stringify.py SolverKernels.cl solverKernelsCL >SolverKernels.h
+stringify.py batchingKernels.cl batchingKernelsCL >batchingKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsBatching.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsBatching.bat
@@ -0,0 +1,8 @@
+@rem Regenerates batchingKernels.h from batchingKernels.cl: the kernel source
+@rem becomes the C string constant batchingKernelsCL (see stringify.py).
+@rem Presumably relies on .py files being associated with a Python interpreter.
+stringify.py batchingKernels.cl batchingKernelsCL >batchingKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsNarrowphase.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsNarrowphase.bat
@@ -0,0 +1,8 @@
+@rem Regenerates ChNarrowphaseKernels.h from ChNarrowphaseKernels.cl: the kernel
+@rem source becomes the C string constant narrowphaseKernelsCL (see stringify.py).
+@rem Presumably relies on .py files being associated with a Python interpreter.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsSolver.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsSolver.bat
@@ -0,0 +1,8 @@
+@rem Regenerates SolverKernels.h from SolverKernels.cl: the kernel source
+@rem becomes the C string constant solverKernelsCL (see stringify.py).
+@rem Presumably relies on .py files being associated with a Python interpreter.
+stringify.py SolverKernels.cl solverKernelsCL >SolverKernels.h
+
+
+
+
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/main.cpp
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "BasicDemo.h"
+#include "GlutStuff.h"
+#include "btBulletDynamicsCommon.h"
+#include "LinearMath/btHashMap.h"
+
+#ifdef CL_PLATFORM_AMD
+#include "../../opencl/basic_initialize/btOpenCLUtils.h"
+extern cl_context			g_cxMainContext;
+extern cl_command_queue	g_cqCommandQue;
+extern cl_device_id		g_clDevice;
+#endif
+
+
+	
+// Entry point of the basic Bullet demo.
+//
+// With CL_PLATFORM_AMD defined it first creates a GPU OpenCL context and, if
+// the context reports at least one device, selects device 0 and creates a
+// command queue for it; the results are stored in the g_cxMainContext,
+// g_clDevice and g_cqCommandQue globals declared above.
+// NOTE(review): when no device is found the globals for device and queue are
+// left as they were and the demo still starts - confirm that is intended.
+//
+// It then initialises the demo's physics and either
+//  - CHECK_MEMORY_LEAKS: tears the physics down again and returns, or
+//  - otherwise: hands control to glutmain and the GLUT main loop.
+//
+// Always returns 0.
+int main(int argc,char** argv)
+{
+
+	#ifdef CL_PLATFORM_AMD
+	int ciErrNum = 0;
+	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_GPU;//CPU;//GPU;
+	
+	
+	// no OpenGL sharing: both handles are passed as null
+	void* glCtx=0;
+	void* glDC = 0;
+	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
+
+	if (numDev>0)
+	{
+		int deviceIndex =0;
+		g_clDevice = btOpenCLUtils::getDevice(g_cxMainContext,deviceIndex);
+		btOpenCLDeviceInfo clInfo;
+		btOpenCLUtils::getDeviceInfo(g_clDevice,clInfo);
+		btOpenCLUtils::printDeviceInfo(g_clDevice);
+		// create a command-queue
+		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, g_clDevice, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+#endif //#ifdef CL_PLATFORM_AMD
+
+
+	BasicDemo ccdDemo;
+	ccdDemo.initPhysics();
+	
+
+#ifdef CHECK_MEMORY_LEAKS
+	// leak-check build: no window is created here, so the GLUT main loop
+	// must not be entered - just tear down and fall through to the return
+	ccdDemo.exitPhysics();
+#else
+	glutmain(argc, argv,1024,600,"Bullet Physics Demo. http://bulletphysics.org",&ccdDemo);
+
+	//setupGUI(1024,768);
+	glutMainLoop();
+#endif
+	
+	//default glut doesn't return from mainloop
+	return 0;
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/premake4.lua
@@ -0,0 +1,34 @@
+
+-- include "AMD"
+
+-- premake4 project definition for the basic Bullet 2 demo executable.
+-- The project is only declared when premake runs on Windows; on other
+-- platforms this file contributes nothing.
+if os.is("Windows") then
+	
+		project "basic_bullet2_demo"
+
+		language "C++"
+				
+		-- console executable, written to the shared bin directory
+		kind "ConsoleApp"
+		targetdir "../../bin"
+
+		-- header search paths, relative to this script's directory
+  		includedirs {
+                ".",
+                "../../bullet2",
+                "../testbed",
+                	"../../rendering/Gwen",
+                }
+		
+
+		-- sibling projects this demo links against
+		links { "testbed",
+			"bullet2",
+			"gwen"
+		}
+		
+		-- helpers defined outside this file; presumably they add the
+		-- OpenGL / GLUT include paths and libraries - confirm in the build scripts
+		initOpenGL()
+		initGlut()
+	
+		-- every source and header below this directory (recursive)
+		files {
+		"**.cpp",
+		"**.h"
+		}
+
+end
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/DebugCastResult.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/DebugCastResult.h
@@ -0,0 +1,88 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef DEBUG_CAST_RESULT_H
+#define DEBUG_CAST_RESULT_H
+
+#include "BulletCollision/NarrowPhaseCollision/btConvexCast.h"
+#include "LinearMath/btTransform.h"
+#include "GL_ShapeDrawer.h"
+#include "GlutStuff.h"
+#ifdef WIN32
+#include <windows.h>
+#endif
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+#include <GL/gl.h>
+#endif
+/// Convex-cast result that can draw itself with immediate-mode OpenGL.
+/// Stores the start transform, the cast shape and its linear/angular velocity
+/// so that DebugDraw can render the shape at any fraction of the cast.
+struct btDebugCastResult : public btConvexCast::CastResult
+{
+
+	btTransform	m_fromTrans;
+	const btPolyhedralConvexShape* m_shape;
+	btVector3	m_linVel;
+	btVector3 m_angVel;
+	GL_ShapeDrawer* m_shapeDrawer;
+
+	/// Copies all arguments into the members; drawer may be null, in which
+	/// case DebugDraw renders nothing.
+	btDebugCastResult(const btTransform& fromTrans,const btPolyhedralConvexShape* shape,
+					const btVector3& linVel,const btVector3& angVel,GL_ShapeDrawer* drawer)
+		: m_fromTrans(fromTrans)
+		, m_shape(shape)
+		, m_linVel(linVel)
+		, m_angVel(angVel)
+		, m_shapeDrawer(drawer)
+	{
+	}
+
+	/// Draws the three unit axes of tr as lines: X red, Y green, Z blue.
+	virtual void drawCoordSystem(const btTransform& tr)  
+	{
+		// each row is both the axis direction and its colour
+		const btScalar unitAxes[3][3] = {
+			{ 1, 0, 0 },
+			{ 0, 1, 0 },
+			{ 0, 0, 1 }
+		};
+
+		btScalar glMat[16];
+		tr.getOpenGLMatrix(glMat);
+
+		glPushMatrix();
+		btglLoadMatrix(glMat);
+		glBegin(GL_LINES);
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			const btScalar* dir = unitAxes[axis];
+			btglColor3(dir[0], dir[1], dir[2]);
+			btglVertex3(0, 0, 0);
+			btglVertex3(dir[0], dir[1], dir[2]);
+		}
+		glEnd();
+		glPopMatrix();
+	}
+
+	/// Draws the shape in red at the pose reached after integrating the
+	/// stored velocities from m_fromTrans over `fraction`.
+	virtual void	DebugDraw(btScalar	fraction)
+	{
+		const btVector3 boundsLo(-1000,-1000,-1000);
+		const btVector3 boundsHi(1000,1000,1000);
+
+		btTransform poseAtHit;
+		btTransformUtil::integrateTransform(m_fromTrans,m_linVel,m_angVel,fraction,poseAtHit);
+
+		btScalar glMat[16];
+		poseAtHit.getOpenGLMatrix(glMat);
+
+		// nothing to render with
+		if (!m_shapeDrawer)
+			return;
+
+		m_shapeDrawer->drawOpenGL(glMat,m_shape,btVector3(1,0,0),btIDebugDraw::DBG_NoDebug,boundsLo,boundsHi);
+	}
+};
+
+
+#endif //DEBUG_CAST_RESULT_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.h
@@ -0,0 +1,257 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef DEMO_APPLICATION_H
+#define DEMO_APPLICATION_H
+
+
+#include "GlutStuff.h"
+#include "GL_ShapeDrawer.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btMatrix3x3.h"
+#include "LinearMath/btTransform.h"
+#include "LinearMath/btQuickprof.h"
+#include "LinearMath/btAlignedObjectArray.h"
+
+class	btCollisionShape;
+class	btDynamicsWorld;
+class	btRigidBody;
+class	btTypedConstraint;
+
+
+
+/// Abstract base class for the testbed demos.
+/// It holds the dynamics world pointer, camera and input state, and declares the
+/// callbacks that the windowing layer forwards to. Concrete demos must implement
+/// initPhysics() and clientMoveAndDisplay(); the windowing back-end must implement
+/// swapBuffers() and updateModifierKeys().
+class DemoApplication
+{
+protected:
+	void	displayProfileString(int xOffset,int yStart,char* message);
+	class CProfileIterator* m_profileIterator;
+
+	protected:
+#ifdef USE_BT_CLOCK
+	// Only present when USE_BT_CLOCK is defined; used by getDeltaTimeMicroseconds().
+	btClock m_clock;
+#endif //USE_BT_CLOCK
+
+	///this is the most important class
+	btDynamicsWorld*		m_dynamicsWorld;
+
+	///constraint for mouse picking
+	btTypedConstraint*		m_pickConstraint;
+
+	virtual void removePickingConstraint();
+
+	btCollisionShape*	m_shootBoxShape;
+
+	float	m_cameraDistance;
+	int	m_debugMode;
+	
+	// Camera elevation / azimuth -- presumably angles in degrees; confirm in DemoApplication.cpp.
+	float m_ele;
+	float m_azi;
+	btVector3 m_cameraPosition;
+	btVector3 m_cameraTargetPosition;//look at
+
+	int	m_mouseOldX;
+	int	m_mouseOldY;
+	int	m_mouseButtons;
+public:
+	// Public so the back-end's updateModifierKeys() result can be read by other code.
+	int	m_modifierKeys;
+protected:
+
+	float m_scaleBottom;
+	float m_scaleFactor;
+	btVector3 m_cameraUp;
+	int	m_forwardAxis;
+
+	int m_glutScreenWidth;
+	int m_glutScreenHeight;
+
+	// Near / far clip plane distances, set through setFrustumZPlanes().
+	float	m_frustumZNear;
+	float	m_frustumZFar;
+
+	int	m_ortho;
+
+	float	m_ShootBoxInitialSpeed;
+	
+	bool	m_stepping;
+	bool m_singleStep;
+	// Exposed through isIdle() / setIdle().
+	bool m_idle;
+	int m_lastKey;
+
+	virtual float  showProfileInfo(int& xOffset,int& yStart, int yIncr);
+	void renderscene(int pass);
+
+	// Shape renderer; setTexturing()/getTexturing() dereference it without a null check.
+	GL_ShapeDrawer*	m_shapeDrawer;
+	bool			m_enableshadows;
+	btVector3		m_sundirection;
+	btScalar		m_defaultContactProcessingThreshold;
+
+public:
+		
+	DemoApplication();
+	
+	virtual ~DemoApplication();
+
+	/// Returns the dynamics world pointer held by this application.
+	btDynamicsWorld*		getDynamicsWorld()
+	{
+		return m_dynamicsWorld;
+	}
+
+	/// Must be implemented by each demo.
+	virtual	void initPhysics() = 0;
+
+	/// Default implementation ignores the request; derived classes may override.
+	virtual	void setDrawClusters(bool drawClusters)
+	{
+
+	}
+
+	void overrideGLShapeDrawer (GL_ShapeDrawer* shapeDrawer);
+	
+	void setOrthographicProjection();
+	void resetPerspectiveProjection();
+	
+	/// Enables/disables texturing on the shape drawer; returns what enableTexture() returns.
+	bool	setTexturing(bool enable) { return(m_shapeDrawer->enableTexture(enable)); }
+	/// Sets the shadow flag and returns its previous value.
+	bool	setShadows(bool enable)	{ bool p=m_enableshadows;m_enableshadows=enable;return(p); }
+	bool	getTexturing() const
+	{
+		return m_shapeDrawer->hasTextureEnabled();
+	}
+	bool	getShadows() const
+	{
+		return m_enableshadows;
+	}
+
+
+	int		getDebugMode()
+	{
+		return m_debugMode ;
+	}
+	
+	void	setDebugMode(int mode);
+	
+	void	setAzi(float azi)
+	{
+		m_azi = azi;
+	}
+	
+	void	setCameraUp(const btVector3& camUp)
+	{
+		m_cameraUp = camUp;
+	}
+	void	setCameraForwardAxis(int axis)
+	{
+		m_forwardAxis = axis;
+	}
+
+	virtual void myinit();
+
+	void toggleIdle();
+	
+	virtual void updateCamera();
+
+	btVector3	getCameraPosition()
+	{
+		return m_cameraPosition;
+	}
+	btVector3	getCameraTargetPosition()
+	{
+		return m_cameraTargetPosition;
+	}
+
+	/// With USE_BT_CLOCK defined: returns the clock's microsecond reading and resets the
+	/// clock, so each call reports the time since the previous call.
+	/// Otherwise: returns a fixed 16666 microseconds.
+	btScalar	getDeltaTimeMicroseconds()
+	{
+#ifdef USE_BT_CLOCK
+		btScalar dt = (btScalar)m_clock.getTimeMicroseconds();
+		m_clock.reset();
+		return dt;
+#else
+		return btScalar(16666.);
+#endif
+	}
+	void setFrustumZPlanes(float zNear, float zFar)
+	{
+		m_frustumZNear = zNear;
+		m_frustumZFar = zFar;
+	}
+
+	///glut callbacks
+				
+	float	getCameraDistance();
+	void	setCameraDistance(float dist);	
+	void	moveAndDisplay();
+
+	/// Must be implemented by each demo.
+	virtual void clientMoveAndDisplay() = 0;
+
+	virtual void	clientResetScene();
+
+	///Demo functions
+	virtual void setShootBoxShape ();
+	virtual void	shootBox(const btVector3& destination);
+
+
+	btVector3	getRayTo(int x,int y);
+
+	btRigidBody*	localCreateRigidBody(float mass, const btTransform& startTransform,btCollisionShape* shape);
+
+	///callback methods by glut	
+
+	virtual void keyboardCallback(unsigned char key, int x, int y);
+	
+	// The following three handlers are no-ops by default; back-ends/demos may override them.
+	virtual void keyboardUpCallback(unsigned char key, int x, int y) {}
+	
+	virtual void specialKeyboard(int key, int x, int y){}
+
+	virtual void specialKeyboardUp(int key, int x, int y){}
+
+	virtual void reshape(int w, int h);
+
+	virtual void mouseFunc(int button, int state, int x, int y);
+
+	virtual void	mouseMotionFunc(int x,int y);
+	
+	virtual void displayCallback();
+
+	virtual 	void renderme();
+
+	// Windowing back-end hooks (pure virtual).
+	virtual		void swapBuffers() = 0;
+
+	virtual		void	updateModifierKeys() = 0;
+
+	void stepLeft();
+	void stepRight();
+	void stepFront();
+	void stepBack();
+	void zoomIn();
+	void zoomOut();
+
+	bool	isIdle() const
+	{
+		return	m_idle;
+	}
+
+	void	setIdle(bool idle)
+	{
+		m_idle = idle;
+	}
+
+
+};
+
+#endif //DEMO_APPLICATION_H
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.cpp
@@ -0,0 +1,139 @@
+
+#include "GLDebugDrawer.h"
+#include "GLDebugFont.h"
+#include "GlutStuff.h"
+
+
+
+#include <stdio.h> //printf debugging
+/// Creates a debug drawer whose debug mode starts at 0.
+GLDebugDrawer::GLDebugDrawer()
+:m_debugMode(0)
+{
+
+}
+
+/// Draws one GL line segment from 'from' to 'to', setting 'fromColor' before the
+/// first vertex and 'toColor' before the second.
+void	GLDebugDrawer::drawLine(const btVector3& from,const btVector3& to,const btVector3& fromColor, const btVector3& toColor)
+{
+	glBegin(GL_LINES);
+
+	// start point
+	glColor3f(fromColor.x(), fromColor.y(), fromColor.z());
+	glVertex3d(from.x(), from.y(), from.z());
+
+	// end point
+	glColor3f(toColor.x(), toColor.y(), toColor.z());
+	glVertex3d(to.x(), to.y(), to.z());
+
+	glEnd();
+}
+
+/// Single-colour convenience overload: both end points use 'color'.
+void	GLDebugDrawer::drawLine(const btVector3& from,const btVector3& to,const btVector3& color)
+{
+	this->drawLine(from, to, color, color);
+}
+
+/// Draws a coarse sphere of the given radius centred at 'p' in the given colour (alpha 1).
+/// The sphere is built from quad strips between consecutive latitude rings, using
+/// 5 latitude and 5 longitude subdivisions.
+void GLDebugDrawer::drawSphere (const btVector3& p, btScalar radius, const btVector3& color)
+{
+	glColor4f (color.getX(), color.getY(), color.getZ(), btScalar(1.0f));
+	glPushMatrix ();
+	// Move the origin to the sphere centre; vertices below are relative to it.
+	glTranslatef (p.getX(), p.getY(), p.getZ());
+
+	int lats = 5;
+	int longs = 5;
+
+	int i, j;
+	// NOTE(review): for i == 0 the lower ring uses (i - 1) / lats = -1/5, i.e. a latitude
+	// below -PI/2, so the first strip lies outside the usual [-PI/2, PI/2] range.
+	for(i = 0; i <= lats; i++) {
+		// Lower ring of this strip: height z0 and ring radius zr0.
+		btScalar lat0 = SIMD_PI * (-btScalar(0.5) + (btScalar) (i - 1) / lats);
+		btScalar z0  = radius*sin(lat0);
+		btScalar zr0 =  radius*cos(lat0);
+
+		// Upper ring of this strip: height z1 and ring radius zr1.
+		btScalar lat1 = SIMD_PI * (-btScalar(0.5) + (btScalar) i / lats);
+		btScalar z1 = radius*sin(lat1);
+		btScalar zr1 = radius*cos(lat1);
+
+		glBegin(GL_QUAD_STRIP);
+		for(j = 0; j <= longs; j++) {
+			btScalar lng = 2 * SIMD_PI * (btScalar) (j - 1) / longs;
+			btScalar x = cos(lng);
+			btScalar y = sin(lng);
+
+			// The normal is passed with the same (radius-scaled) components as the vertex,
+			// so it is not unit length unless radius == 1.
+			glNormal3f(x * zr0, y * zr0, z0);
+			glVertex3f(x * zr0, y * zr0, z0);
+			glNormal3f(x * zr1, y * zr1, z1);
+			glVertex3f(x * zr1, y * zr1, z1);
+		}
+		glEnd();
+	}
+
+	glPopMatrix();
+}
+
+/// Sets up colour and model transform for an axis-aligned box spanning boxMin..boxMax.
+/// NOTE(review): the only geometry call (glutSolidCube) is commented out, so as written
+/// this function emits no primitives; its only lasting effect is the glColor4f call.
+void GLDebugDrawer::drawBox (const btVector3& boxMin, const btVector3& boxMax, const btVector3& color, btScalar alpha)
+{
+	btVector3 halfExtent = (boxMax - boxMin) * btScalar(0.5f);
+	btVector3 center = (boxMax + boxMin) * btScalar(0.5f);
+	//glEnable(GL_BLEND);     // Turn blending On
+	//glBlendFunc(GL_SRC_ALPHA, GL_ONE);
+	glColor4f (color.getX(), color.getY(), color.getZ(), alpha);
+	glPushMatrix ();
+	// Translate to the box centre and scale a unit cube to the full box extents.
+	glTranslatef (center.getX(), center.getY(), center.getZ());
+	glScaled(2*halfExtent[0], 2*halfExtent[1], 2*halfExtent[2]);
+//	glutSolidCube(1.0);
+	glPopMatrix ();
+	//glDisable(GL_BLEND);
+}
+
+/// Draws the filled triangle (a, b, c) in 'color' with the given alpha.
+/// A single face normal, the normalised cross product of the edges a->b and a->c,
+/// is used for all three vertices. Drawing is not gated on the debug mode.
+void	GLDebugDrawer::drawTriangle(const btVector3& a,const btVector3& b,const btVector3& c,const btVector3& color,btScalar alpha)
+{
+	const btVector3 edgeAB = b-a;
+	const btVector3 edgeAC = c-a;
+	const btVector3 faceNormal = btCross(edgeAB,edgeAC).normalized();
+
+	glBegin(GL_TRIANGLES);
+	glColor4f(color.x(), color.y(), color.z(), alpha);
+	glNormal3d(faceNormal.x(), faceNormal.y(), faceNormal.z());
+	glVertex3d(a.x(), a.y(), a.z());
+	glVertex3d(b.x(), b.y(), b.z());
+	glVertex3d(c.x(), c.y(), c.z());
+	glEnd();
+}
+
+/// Stores the debug mode value later returned by getDebugMode().
+void	GLDebugDrawer::setDebugMode(int debugMode)
+{
+	m_debugMode = debugMode;
+
+}
+
+/// Sets the GL raster position to 'location'.
+/// NOTE(review): the string drawing call is commented out, so 'textString' is
+/// currently unused and no text is rendered.
+void	GLDebugDrawer::draw3dText(const btVector3& location,const char* textString)
+{
+	glRasterPos3f(location.x(),  location.y(),  location.z());
+	//BMF_DrawString(BMF_GetFont(BMF_kHelvetica10),textString);
+}
+
+/// Prints the warning string to stdout, followed by a newline.
+void	GLDebugDrawer::reportErrorWarning(const char* warningString)
+{
+	printf("%s\n",warningString);
+}
+
+/// Draws a contact as a line starting at 'pointOnB' and extending one unit along
+/// 'normalOnB', in the given colour (alpha 1).
+/// 'distance' and 'lifeTime' are accepted for interface compatibility but are not used.
+void	GLDebugDrawer::drawContactPoint(const btVector3& pointOnB,const btVector3& normalOnB,btScalar distance,int lifeTime,const btVector3& color)
+{
+	// The line has a fixed length of 1 rather than the contact distance.
+	const btVector3 lineEnd = pointOnB + normalOnB*btScalar(1);
+
+	glColor4f(color.x(), color.y(), color.z(), 1.f);
+
+	glBegin(GL_LINES);
+	glVertex3d(pointOnB.x(), pointOnB.y(), pointOnB.z());
+	glVertex3d(lineEnd.x(), lineEnd.y(), lineEnd.z());
+	glEnd();
+}
+
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.h
@@ -0,0 +1,38 @@
+#ifndef GL_DEBUG_DRAWER_H
+#define GL_DEBUG_DRAWER_H
+
+#include "LinearMath/btIDebugDraw.h"
+
+
+
+/// btIDebugDraw implementation that renders debug primitives with immediate-mode OpenGL
+/// and reports warnings through printf (see GLDebugDrawer.cpp).
+class GLDebugDrawer : public btIDebugDraw
+{
+	// Value set by setDebugMode() and returned by getDebugMode().
+	int m_debugMode;
+
+public:
+
+	GLDebugDrawer();
+
+
+	/// Line with a separate colour per end point.
+	virtual void	drawLine(const btVector3& from,const btVector3& to,const btVector3& fromColor, const btVector3& toColor);
+
+	/// Line with a single colour.
+	virtual void	drawLine(const btVector3& from,const btVector3& to,const btVector3& color);
+
+	virtual void	drawSphere (const btVector3& p, btScalar radius, const btVector3& color);
+	virtual void	drawBox (const btVector3& boxMin, const btVector3& boxMax, const btVector3& color, btScalar alpha);
+
+	virtual void	drawTriangle(const btVector3& a,const btVector3& b,const btVector3& c,const btVector3& color,btScalar alpha);
+	
+	virtual void	drawContactPoint(const btVector3& PointOnB,const btVector3& normalOnB,btScalar distance,int lifeTime,const btVector3& color);
+
+	virtual void	reportErrorWarning(const char* warningString);
+
+	virtual void	draw3dText(const btVector3& location,const char* textString);
+
+	virtual void	setDebugMode(int debugMode);
+
+	virtual int		getDebugMode() const { return m_debugMode;}
+
+};
+
+#endif//GL_DEBUG_DRAWER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.h
@@ -0,0 +1,29 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef BT_DEBUG_FONT_H
+#define BT_DEBUG_FONT_H
+
+#include "LinearMath/btVector3.h"
+
+
+void	GLDebugDrawStringInternal(int x,int y,const char* string,const btVector3& rgb, bool enableBlend, int spacing);
+void	GLDebugDrawStringInternal(int x,int y,const char* string,const btVector3& rgb);
+void	GLDebugDrawString(int x,int y,const char* string);
+void	GLDebugResetFont(int screenWidth,int screenHeight);
+
+#endif //BT_DEBUG_FONT_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.h
@@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef GL_SHAPE_DRAWER_H
+#define GL_SHAPE_DRAWER_H
+
+class btCollisionShape;
+class btShapeHull;
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btVector3.h"
+
+#include "BulletCollision/CollisionShapes/btShapeHull.h"
+
+/// OpenGL shape drawing
+/// Renders collision shapes (and their shadows) with OpenGL and keeps a per-shape
+/// cache of convex hull data.
+class GL_ShapeDrawer
+{
+protected:
+	/// Cached hull and edge list for one convex shape.
+	struct ShapeCache
+	{
+	// An edge stores two normals and two vertex indices.
+	struct Edge { btVector3 n[2];int v[2]; };
+	ShapeCache(btConvexShape* s) : m_shapehull(s) {}
+	btShapeHull					m_shapehull;
+	btAlignedObjectArray<Edge>	m_edges;
+	};
+	//clean-up memory of dynamically created shape hulls
+	btAlignedObjectArray<ShapeCache*>	m_shapecaches;
+	unsigned int						m_texturehandle;
+	bool								m_textureenabled;
+	bool								m_textureinitialized;
+	
+
+	ShapeCache*							cache(btConvexShape*);
+
+public:
+		GL_ShapeDrawer();
+
+		virtual ~GL_ShapeDrawer();
+
+		///drawOpenGL might allocate temporary memory, stores pointer in shape userpointer
+		virtual void		drawOpenGL(btScalar* m, const btCollisionShape* shape, const btVector3& color,int	debugMode,const btVector3& worldBoundsMin,const btVector3& worldBoundsMax);
+		virtual void		drawShadow(btScalar* m, const btVector3& extrusion,const btCollisionShape* shape,const btVector3& worldBoundsMin,const btVector3& worldBoundsMax);
+		
+		/// Sets the texture-enabled flag and returns its previous value.
+		bool		enableTexture(bool enable) { bool p=m_textureenabled;m_textureenabled=enable;return(p); }
+		bool		hasTextureEnabled() const
+		{
+			return m_textureenabled;
+		}
+		
+		static void		drawCylinder(float radius,float halfHeight, int upAxis);
+		void			drawSphere(btScalar r, int lats, int longs);
+		static void		drawCoordSystem();
+		
+};
+
+void OGL_displaylist_register_shape(btCollisionShape * shape);
+void OGL_displaylist_clean();
+
+#endif //GL_SHAPE_DRAWER_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.cpp
@@ -0,0 +1,76 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#include "GL_Simplex1to4.h"
+#include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
+#include "GL_ShapeDrawer.h"
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+#include <GL/gl.h>
+#endif
+#include "GlutStuff.h"
+#include "LinearMath/btTransform.h"
+
+/// Starts without a simplex solver; calcClosest() only draws the coordinate system
+/// until one is supplied via setSimplexSolver().
+GL_Simplex1to4::GL_Simplex1to4()
+:m_simplexSolver(0)
+{
+}
+
+///
+/// Debugging method calcClosest calculates the closest point to the origin, using m_simplexSolver,
+/// and draws a red line from the origin to that point.
+///
+/// 'm' is an OpenGL matrix (16 scalars) used to transform the simplex vertices before
+/// they are fed to the solver. The coordinate system is always drawn; the closest-point
+/// line is drawn only when a simplex solver has been set. Lighting is disabled while the
+/// line is drawn and enabled again afterwards.
+///
+void	GL_Simplex1to4::calcClosest(btScalar* m)
+{
+	btTransform tr;
+	tr.setFromOpenGLMatrix(m);
+
+	GL_ShapeDrawer::drawCoordSystem();
+
+	if (!m_simplexSolver)
+		return;
+
+	m_simplexSolver->reset();
+
+	// Start at the origin so the line below is well defined (zero length) when the
+	// simplex has no vertices; previously 'v' was read uninitialised in that case.
+	btVector3 v(0.f,0.f,0.f);
+
+	for (int i=0;i<m_numVertices;i++)
+	{
+		v =  tr(m_vertices[i]);
+		m_simplexSolver->addVertex(v,v,btVector3(0.f,0.f,0.f));
+		// closest() overwrites v with the solver's current closest point; its boolean
+		// result is not needed here.
+		m_simplexSolver->closest(v);
+	}
+
+	//draw v?
+	glDisable(GL_LIGHTING);
+	glBegin(GL_LINES);
+	btglColor3(1.f, 0.f, 0.f);
+	btglVertex3(0.f, 0.f, 0.f);
+	btglVertex3(v.x(),v.y(),v.z());
+	glEnd();
+
+	glEnable(GL_LIGHTING);
+}
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.h
@@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef GL_SIMPLEX_1TO4_H
+#define GL_SIMPLEX_1TO4_H
+
+#include "BulletCollision/CollisionShapes/btTetrahedronShape.h"
+
+#include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
+
+///GL_Simplex1to4 is a class to debug a Simplex Solver with 1 to 4 points. 
+///Can be used by GJK.
+/// Simplex shape with an optional attached simplex solver, used to visualise the
+/// solver's closest-point result (see calcClosest in GL_Simplex1to4.cpp).
+class GL_Simplex1to4 : public btBU_Simplex1to4
+{
+	// Not owned as far as this header shows; null until setSimplexSolver() is called.
+	btSimplexSolverInterface*	m_simplexSolver;
+
+	public:
+
+	GL_Simplex1to4();
+
+	/// 'm' is an OpenGL matrix (16 scalars).
+	void	calcClosest(btScalar* m);
+
+	/// Sets the solver used by calcClosest().
+	void	setSimplexSolver(btSimplexSolverInterface* simplexSolver) {
+		m_simplexSolver = simplexSolver;
+	}
+
+};
+
+#endif //GL_SIMPLEX_1TO4_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.cpp
@@ -0,0 +1,87 @@
+
+#ifndef _WINDOWS
+
+#include "GlutDemoApplication.h"
+
+#include "GlutStuff.h"
+
+#include "BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+/// Rebuilds m_modifierKeys from GLUT's current modifier state: every GLUT_ACTIVE_* bit
+/// reported by glutGetModifiers() contributes the corresponding BT_ACTIVE_* flag.
+void	GlutDemoApplication::updateModifierKeys()
+{
+	// Each row: { GLUT modifier bit to test, flag to set in m_modifierKeys }.
+	static const int modifierMap[3][2] =
+	{
+		{ GLUT_ACTIVE_ALT,   BT_ACTIVE_ALT   },
+		{ GLUT_ACTIVE_CTRL,  BT_ACTIVE_CTRL  },
+		{ GLUT_ACTIVE_SHIFT, BT_ACTIVE_SHIFT }
+	};
+
+	int activeFlags = 0;
+	for (int row = 0; row < 3; ++row)
+	{
+		// GLUT is queried once per modifier.
+		if (glutGetModifiers() & modifierMap[row][0])
+		{
+			activeFlags |= modifierMap[row][1];
+		}
+	}
+
+	m_modifierKeys = activeFlags;
+}
+
+/// Handles GLUT "special" keys (function keys, arrows, paging keys).
+///  - F1 / F2: recognised, currently no action
+///  - END: removes the last collision object from the world and deletes it
+///  - arrows: step the camera left/right/front/back
+///  - PAGE UP / PAGE DOWN: zoom in / out
+///  - HOME: toggle idle
+/// A redisplay is requested after every key, handled or not. The mouse position is unused.
+void GlutDemoApplication::specialKeyboard(int key, int x, int y)	
+{
+	(void)x;
+	(void)y;
+
+	switch (key) 
+	{
+	case GLUT_KEY_F1:
+	case GLUT_KEY_F2:
+		// no action bound to these keys
+		break;
+
+	case GLUT_KEY_END:
+		{
+			btDynamicsWorld* world = getDynamicsWorld();
+			const int objectCount = world->getNumCollisionObjects();
+			if (objectCount)
+			{
+				btCollisionObject* lastObject = world->getCollisionObjectArray()[objectCount-1];
+				world->removeCollisionObject(lastObject);
+
+				// A rigid body's motion state is deleted here too; the collision
+				// shape is not touched by this code.
+				btRigidBody* rigidBody = btRigidBody::upcast(lastObject);
+				if (rigidBody && rigidBody->getMotionState())
+				{
+					delete rigidBody->getMotionState();
+				}
+				delete lastObject;
+			}
+			break;
+		}
+
+	case GLUT_KEY_LEFT:
+		stepLeft();
+		break;
+	case GLUT_KEY_RIGHT:
+		stepRight();
+		break;
+	case GLUT_KEY_UP:
+		stepFront();
+		break;
+	case GLUT_KEY_DOWN:
+		stepBack();
+		break;
+	case GLUT_KEY_PAGE_UP:
+		zoomIn();
+		break;
+	case GLUT_KEY_PAGE_DOWN:
+		zoomOut();
+		break;
+	case GLUT_KEY_HOME:
+		toggleIdle();
+		break;
+
+	default:
+		// unused (special) key
+		break;
+	}
+
+	glutPostRedisplay();
+}
+
+/// Implements DemoApplication::swapBuffers() by calling glutSwapBuffers().
+void GlutDemoApplication::swapBuffers()
+{
+	glutSwapBuffers();
+
+}
+
+#endif //_WINDOWS
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.h
@@ -0,0 +1,34 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef GLUT_DEMO_APPLICATION_H
+#define GLUT_DEMO_APPLICATION_H
+
+#include "DemoApplication.h"
+
+/// GLUT back-end for DemoApplication: supplies the buffer swap, modifier-key query and
+/// special-key handling. initPhysics() and clientMoveAndDisplay() are not declared here,
+/// so the class is still abstract.
+class GlutDemoApplication : public DemoApplication
+{
+public:
+	
+	// Overrides the virtual DemoApplication::specialKeyboard.
+	void specialKeyboard(int key, int x, int y);
+
+	virtual void swapBuffers();
+
+	virtual	void	updateModifierKeys();
+
+};
+#endif //GLUT_DEMO_APPLICATION_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.cpp
@@ -0,0 +1,119 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _WINDOWS
+
+#include "DemoApplication.h"
+
+//glut is C code, this global gDemoApplication links glut to the C++ demo
+static DemoApplication* gDemoApplication = 0;
+
+
+#include "GlutStuff.h"
+
+// The functions below are the plain C callbacks registered with GLUT in glutmain().
+// Each one forwards its arguments to the matching method on gDemoApplication.
+// None of them checks gDemoApplication for null.
+
+/// Forwards to DemoApplication::keyboardCallback.
+static	void glutKeyboardCallback(unsigned char key, int x, int y)
+{
+	gDemoApplication->keyboardCallback(key,x,y);
+}
+
+/// Forwards to DemoApplication::keyboardUpCallback.
+static	void glutKeyboardUpCallback(unsigned char key, int x, int y)
+{
+  gDemoApplication->keyboardUpCallback(key,x,y);
+}
+
+/// Forwards to DemoApplication::specialKeyboard.
+static void glutSpecialKeyboardCallback(int key, int x, int y)
+{
+	gDemoApplication->specialKeyboard(key,x,y);
+}
+
+/// Forwards to DemoApplication::specialKeyboardUp.
+static void glutSpecialKeyboardUpCallback(int key, int x, int y)
+{
+	gDemoApplication->specialKeyboardUp(key,x,y);
+}
+
+
+/// Forwards to DemoApplication::reshape.
+static void glutReshapeCallback(int w, int h)
+{
+	gDemoApplication->reshape(w,h);
+}
+
+/// Forwards to DemoApplication::moveAndDisplay; registered as the idle function.
+static void glutMoveAndDisplayCallback()
+{
+	gDemoApplication->moveAndDisplay();
+}
+
+/// Forwards to DemoApplication::mouseFunc.
+static void glutMouseFuncCallback(int button, int state, int x, int y)
+{
+	gDemoApplication->mouseFunc(button,state,x,y);
+}
+
+
+/// Forwards to DemoApplication::mouseMotionFunc; registered for both passive and
+/// button-held mouse motion.
+static void	glutMotionFuncCallback(int x,int y)
+{
+	gDemoApplication->mouseMotionFunc(x,y);
+}
+
+
+/// Forwards to DemoApplication::displayCallback.
+static void glutDisplayCallback(void)
+{
+	gDemoApplication->displayCallback();
+}
+
+
+/// Initialises GLUT, creates a double-buffered RGBA window with depth and stencil buffers,
+/// calls demoApp->myinit(), registers the GLUT callbacks and runs one move-and-display step.
+///
+/// argc/argv are passed to glutInit; width/height/title describe the window;
+/// demoApp is stored in the file-level gDemoApplication pointer used by the callbacks.
+/// Always returns 0.
+///
+/// NOTE(review): this function does not call glutMainLoop(); presumably the caller is
+/// expected to enter the main loop itself -- confirm against the call sites.
+int glutmain(int argc, char **argv,int width,int height,const char* title,DemoApplication* demoApp) {
+    
+	gDemoApplication = demoApp;
+
+	glutInit(&argc, argv);
+    glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH | GLUT_STENCIL);
+    glutInitWindowPosition(0, 0);
+    glutInitWindowSize(width, height);
+    glutCreateWindow(title);
+#ifdef BT_USE_FREEGLUT
+	// freeglut only: make glutMainLoop() return when the window is closed.
+	glutSetOption (GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_GLUTMAINLOOP_RETURNS);
+#endif
+
+    gDemoApplication->myinit();
+
+	glutKeyboardFunc(glutKeyboardCallback);
+	glutKeyboardUpFunc(glutKeyboardUpCallback);
+	glutSpecialFunc(glutSpecialKeyboardCallback);
+	glutSpecialUpFunc(glutSpecialKeyboardUpCallback);
+
+	glutReshapeFunc(glutReshapeCallback);
+    //createMenu();
+	glutIdleFunc(glutMoveAndDisplayCallback);
+	glutMouseFunc(glutMouseFuncCallback);
+	// The same handler receives motion with and without a mouse button held.
+	glutPassiveMotionFunc(glutMotionFuncCallback);
+	glutMotionFunc(glutMotionFuncCallback);
+	glutDisplayFunc( glutDisplayCallback );
+
+	// Run one simulation/display step immediately.
+	glutMoveAndDisplayCallback();
+
+//enable vsync to avoid tearing on Apple (todo: for Windows)
+
+#if defined(__APPLE__) && !defined (VMDMESA)
+int swap_interval = 1;
+CGLContextObj cgl_context = CGLGetCurrentContext();
+CGLSetParameter(cgl_context, kCGLCPSwapInterval, &swap_interval);
+#endif
+
+
+	
+    return 0;
+}
+
+
+#endif //_WINDOWS
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.h
@@ -0,0 +1,86 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2012 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef GLUT_STUFF_H
+#define GLUT_STUFF_H
+
+#ifdef _WIN32//for glut.h
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#include <GLUT/glut.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/gl.h>
+#include <GL/glut.h>
+#endif //_WINDOWS
+#endif //APPLE
+
+// Key and modifier aliases, so demo code does not depend on the windowing back-end.
+// Under _WINDOWS only BT_ACTIVE_ALT is provided (mapped to a Win32 virtual key);
+// otherwise each alias maps to the corresponding GLUT constant.
+#ifdef _WINDOWS
+#define BT_ACTIVE_ALT   VK_LMENU
+
+#else
+#define BT_KEY_K 'k'
+#define BT_KEY_LEFT			GLUT_KEY_LEFT
+#define BT_KEY_RIGHT		GLUT_KEY_RIGHT
+#define BT_KEY_UP			GLUT_KEY_UP
+#define BT_KEY_DOWN			GLUT_KEY_DOWN
+#define	BT_KEY_F1			GLUT_KEY_F1
+#define	BT_KEY_F2			GLUT_KEY_F2
+#define	BT_KEY_F3			GLUT_KEY_F3
+#define	BT_KEY_F4			GLUT_KEY_F4
+#define	BT_KEY_F5			GLUT_KEY_F5
+#define BT_KEY_PAGEUP		GLUT_KEY_PAGE_UP
+#define BT_KEY_PAGEDOWN		GLUT_KEY_PAGE_DOWN
+#define BT_KEY_END			GLUT_KEY_END
+#define BT_KEY_HOME			GLUT_KEY_HOME
+#define BT_ACTIVE_ALT		GLUT_ACTIVE_ALT
+// Must be the CTRL bit: aliasing it to GLUT_ACTIVE_ALT made Ctrl indistinguishable from Alt.
+#define	BT_ACTIVE_CTRL		GLUT_ACTIVE_CTRL
+#define BT_ACTIVE_SHIFT		GLUT_ACTIVE_SHIFT
+#endif
+
+#if BT_USE_FREEGLUT
+#include "GL/freeglut_ext.h" //to be able to return from glutMainLoop()
+#endif
+
+
+
+class DemoApplication;
+
+int glutmain(int argc, char **argv,int width,int height,const char* title,DemoApplication* demoApp);
+
+// OpenGL entry points selected to match btScalar: the double-precision ('d') variants
+// when BT_USE_DOUBLE_PRECISION is defined, the single-precision ('f') variants otherwise.
+#if defined(BT_USE_DOUBLE_PRECISION)
+#define btglLoadMatrix glLoadMatrixd
+#define btglMultMatrix glMultMatrixd
+#define btglColor3 glColor3d
+#define btglVertex3 glVertex3d
+#else
+#define btglLoadMatrix glLoadMatrixf
+#define btglMultMatrix glMultMatrixf
+#define btglColor3 glColor3f
+// Single-precision build: use the float variant, consistent with the other three macros.
+#define btglVertex3 glVertex3f
+#endif
+
+#endif //GLUT_STUFF_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.cpp
@@ -0,0 +1,86 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "RenderTexture.h"
+#include <memory.h>
+
+
+/// Allocates a width x height RGBA buffer (4 bytes per pixel), zeroes it, and then fills
+/// it with an initial pattern: red grows with x, green grows with y, blue is 0 and alpha
+/// is fully opaque.
+///
+/// The pattern colours are normalised to [0..1) because setPixel() expects components in
+/// [0..1]; passing raw pixel coordinates would convert out-of-range values to
+/// unsigned char, which is undefined behaviour.
+renderTexture::renderTexture(int width,int height)
+:m_height(height),m_width(width)
+{
+	m_buffer = new unsigned char[m_width*m_height*4];
+	
+	//clear screen
+	memset(m_buffer,0,m_width*m_height*4);
+
+	//clear screen version 2
+	for (int x=0;x<m_width;x++)
+	{
+		// The loop only runs when m_width > 0, so the division is safe.
+		const float red = float(x)/float(m_width);
+		for (int y=0;y<m_height;y++)
+		{
+			const float green = float(y)/float(m_height);
+			setPixel(x,y,btVector4(red,green,0.f,1.f));
+		}
+
+	}
+
+}
+
+/// Additively blends the zero-terminated string 'str' into the texture, with the top-left
+/// corner of the first glyph at (rasterposx, rasterposy).
+///
+/// Each character is drawn as a 16x16 cell taken from a 16-column glyph grid whose first
+/// cell is ASCII 32 (space); the pen then advances 10 pixels, so neighbouring cells overlap.
+/// Only the first byte of each 3-byte font texel is used, as a grey level.
+/// The indexing assumes 'fontData' is a 256x256 image with 3 bytes per texel and rows
+/// stored bottom-up -- TODO confirm against the font data actually passed in.
+///
+/// Characters below ASCII 32 have no glyph and only advance the pen. Pixels that fall
+/// outside the texture are skipped. A null 'str' or 'fontData' draws nothing.
+void renderTexture::grapicalPrintf(char* str,	void* fontData, int rasterposx,int rasterposy)
+{
+	if (!str || !fontData)
+		return;
+
+	const unsigned char* fontPtr = (const unsigned char*) fontData;
+	unsigned char c;
+	int penX=0;
+
+	while ((c = (unsigned char) *str++)) {
+
+		// Computed as int: a plain 'char' may be signed, which made the index negative
+		// for bytes >= 160 in addition to control characters.
+		const int glyph = int(c)-32;
+
+		if (glyph >= 0)
+		{
+			const int sx=glyph%16;
+			const int sy=glyph/16;
+
+			for (int col=0;col<16;col++)
+			{
+				const int px = rasterposx+penX+col;
+				if (px<0 || px>=m_width)
+					continue;	// horizontally outside the texture
+
+				const int i = sx*16+col;
+				for (int row=0;row<16;row++)
+				{
+					const int py = rasterposy+row;
+					if (py<0 || py>=m_height)
+						continue;	// vertically outside the texture
+
+					const int j = sy*16+row;
+					// Row index is flipped (255 - j): the font image is addressed bottom-up.
+					const unsigned char packedColor = (fontPtr[i*3+255*256*3-(256*j)*3]);
+					const float colorf = packedColor/255.f;
+					btVector4 rgba(colorf,colorf,colorf,1.f);
+					addPixel(px,py,rgba);
+				}
+			}
+		}
+
+		// Fixed advance, narrower than the 16 pixel cell.
+		penX+=10;
+	}
+}
+
+/// Releases the pixel buffer allocated with new[] in the constructor.
+renderTexture::~renderTexture()
+{
+	delete [] m_buffer;
+}
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.h
@@ -0,0 +1,73 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef RENDER_TEXTURE_H
+#define RENDER_TEXTURE_H
+
+#include "LinearMath/btVector3.h"
+#include "GLDebugFont.h"
+
+///
+///renderTexture provides a software-render context (setpixel/printf)
+///
+class renderTexture
+{
+	int m_height;
+	int m_width;
+	unsigned char*	m_buffer;	// m_width*m_height pixels, 4 bytes (r,g,b,a) each, row after row
+
+	// The buffer is owned through a raw pointer and deleted in the destructor, so a
+	// compiler-generated copy would delete it twice. Copying is therefore forbidden:
+	// these are declared but intentionally never defined.
+	renderTexture(const renderTexture&);
+	renderTexture& operator=(const renderTexture&);
+
+public:
+
+	renderTexture(int width,int height);
+	~renderTexture();
+
+	///Overwrites the pixel at (x,y).
+	///rgba input is in range [0..1] for each component
+	///No bounds check is done: x must be in [0,width) and y in [0,height).
+	inline void	setPixel(int x,int y,const btVector4& rgba)
+	{
+		unsigned char* pixel = &m_buffer[ (x+y*m_width) * 4];
+
+		pixel[0] = (unsigned char)(255.*rgba.getX());
+		pixel[1] = (unsigned char)(255.*rgba.getY());
+		pixel[2] = (unsigned char)(255.*rgba.getZ());
+		pixel[3] = (unsigned char)(255.*rgba.getW());
+	}
+
+	///Adds rgba (scaled by 255) to the r,g,b channels of the pixel at (x,y),
+	///saturating each channel at 255. The alpha channel is left untouched.
+	///No bounds check is done: x must be in [0,width) and y in [0,height).
+	inline void	addPixel(int x,int y,const btVector4& rgba)
+	{
+		unsigned char* pixel = &m_buffer[ (x+y*m_width) * 4];
+		pixel[0] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[0] + btScalar(255.f)*rgba.getX()));
+		pixel[1] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[1] + btScalar(255.f)*rgba.getY()));
+		pixel[2] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[2] + btScalar(255.f)*rgba.getZ()));
+//		pixel[3] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[3] + btScalar(255.f)*rgba.getW()));
+	}
+
+	///Returns the pixel at (x,y) with every channel rescaled to [0..1].
+	///No bounds check is done.
+	inline btVector4 getPixel(int x,int y) const
+	{
+		const unsigned char* pixel = &m_buffer[ (x+y*m_width) * 4];
+		return btVector4(pixel[0]*1.f/255.f,
+			pixel[1]*1.f/255.f,
+			pixel[2]*1.f/255.f,
+			pixel[3]*1.f/255.f);
+	}
+
+	const unsigned char*	getBuffer() const { return m_buffer;}
+	int	getWidth() const { return m_width;}
+	int	getHeight() const { return m_height;}
+
+	///Draws the zero-terminated string 'str' at (startx,starty) using the font bitmap 'fontData'.
+	void grapicalPrintf(char* str,	void* fontData, int startx = 0,int starty=0);
+
+};
+
+#endif //RENDER_TEXTURE_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32AppMain.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32AppMain.cpp
@@ -0,0 +1,405 @@
+#ifdef _WINDOWS
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2010 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <windows.h>
+#include <gl/gl.h>
+
+
+#include "DemoApplication.h"
+
+#include "GLDebugDrawer.h"
+#include "GLDebugFont.h"
+
+#include "BulletDynamics/Dynamics/btDynamicsWorld.h"
+
+/// This Win32AppMain is shared code between all demos. 
+/// The actual demo, derived from DemoApplication is created using 'createDemo', in a separate .cpp file
+// The running demo: created in WinMain through createDemo() and deleted there on exit.
+DemoApplication* gDemoApplication = 0;
+// Factory implemented by each individual demo.
+DemoApplication*	createDemo();
+
+
+// Function Declarations
+
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
+void EnableOpenGL(HWND hWnd, HDC * hDC, HGLRC * hRC);
+void DisableOpenGL(HWND hWnd, HDC hDC, HGLRC hRC);
+// Set by EnableOpenGL and cleared by DisableOpenGL; WM_SIZE only forwards reshape while it is true.
+static bool sOpenGLInitialized = false;
+// Last client size reported through WM_SIZE.
+static int sWidth = 0;
+static int sHeight =0;
+// Set once 'Q' or Escape was pressed; WM_CHAR input is no longer forwarded afterwards.
+static int quitRequest = 0;
+
+// WinMain
+
+/// Application entry point: creates the demo, a window with an OpenGL context,
+/// and then alternates between pumping window messages and stepping/drawing the demo
+/// until WM_QUIT arrives.
+///
+/// @return the exit code carried by WM_QUIT, or 0 when start-up fails before the
+///         message loop is entered.
+int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, 
+				   LPSTR lpCmdLine, int iCmdShow)
+{
+	(void)hPrevInstance;
+	(void)lpCmdLine;
+	(void)iCmdShow;
+
+	WNDCLASS wc;
+	HWND hWnd;
+	HDC hDC;
+	HGLRC hRC;
+	MSG msg;
+	BOOL quit = FALSE;
+
+	gDemoApplication = createDemo();
+	if (!gDemoApplication)
+	{
+		// nothing to run; WndProc would dereference a null demo
+		return 0;
+	}
+
+	// register window class
+	wc.style = CS_OWNDC;
+	wc.lpfnWndProc = WndProc;
+	wc.cbClsExtra = 0;
+	wc.cbWndExtra = 0;
+	wc.hInstance = hInstance;
+	wc.hIcon = LoadIcon( NULL, IDI_APPLICATION );
+	wc.hCursor = LoadCursor( NULL, IDC_ARROW );
+	wc.hbrBackground = (HBRUSH)GetStockObject( BLACK_BRUSH );
+	wc.lpszMenuName = NULL;
+	wc.lpszClassName = "BulletPhysics";
+	if (!RegisterClass( &wc ))
+	{
+		delete gDemoApplication;
+		gDemoApplication = 0;
+		return 0;
+	}
+
+	// create main window
+	hWnd = CreateWindow( 
+		"BulletPhysics", "Bullet Physics Sample. http://bulletphysics.org", 
+		WS_CAPTION | WS_VISIBLE | WS_OVERLAPPEDWINDOW,
+		0, 0, 1024, 768,
+		NULL, NULL, hInstance, NULL );
+	if (!hWnd)
+	{
+		delete gDemoApplication;
+		gDemoApplication = 0;
+		return 0;
+	}
+
+	// enable OpenGL for the window
+	EnableOpenGL( hWnd, &hDC, &hRC );
+
+	// debugDraw must outlive the main loop: the dynamics world only stores its address
+	GLDebugDrawer debugDraw;
+	gDemoApplication->myinit();
+	gDemoApplication->initPhysics();
+	if (gDemoApplication->getDynamicsWorld())
+		gDemoApplication->getDynamicsWorld()->setDebugDrawer(&debugDraw);
+
+	// sWidth/sHeight were filled in by the WM_SIZE sent while the window was created
+	gDemoApplication->reshape(sWidth,sHeight);
+
+	// program main loop
+	while ( !quit )
+	{
+		// handle at most one pending message per frame
+		if ( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE )  )
+		{
+			if ( msg.message == WM_QUIT ) 
+			{
+				quit = TRUE;
+			} 
+			else 
+			{
+				TranslateMessage( &msg );
+				DispatchMessage( &msg );
+			}
+		}
+
+		glClearColor( .7f, 0.7f, 0.7f, 1.f );
+
+		gDemoApplication->moveAndDisplay();
+
+		SwapBuffers( hDC );
+	}
+
+	// shutdown OpenGL
+	DisableOpenGL( hWnd, hDC, hRC );
+
+	// destroy the window explicitly
+	DestroyWindow( hWnd );
+
+	delete gDemoApplication;
+
+	// the loop only ends on WM_QUIT, whose wParam is the requested exit code
+	return (int)msg.wParam;
+
+}
+
+// Window Procedure
+
+/// Window procedure: translates Win32 input and size messages into calls on
+/// gDemoApplication.
+///
+/// Mouse buttons are forwarded as mouseFunc(button,state,x,y) with button
+/// 0/1/2 = left/middle/right and state 0 = pressed, 1 = released.
+/// Mouse coordinates are extracted as signed 16-bit values, so positions left of or
+/// above the client area stay negative instead of wrapping to large positive numbers.
+///
+/// @return 0 for messages handled here, otherwise the result of DefWindowProc.
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+	switch (message)
+	{
+
+	case WM_SYSKEYDOWN:
+		{
+			// bit 29 of lParam is the context code (ALT held)
+			if (lParam & 1<<29)
+			{
+				gDemoApplication->m_modifierKeys = VK_LMENU;
+			}
+			break;
+		}
+	case WM_SYSKEYUP:
+		{
+			if (lParam & 1<<29)
+			{
+				gDemoApplication->m_modifierKeys = VK_LMENU;
+			} else
+			{
+				gDemoApplication->m_modifierKeys = 0;
+			}
+			break;
+		}
+
+	case WM_SIZE:
+		switch (wParam)
+		{
+		case SIZE_MINIMIZED:
+			return 0;
+
+		case SIZE_MAXIMIZED:
+		case SIZE_RESTORED:
+			// remember the new client size; it is also needed before OpenGL is up
+			sWidth = LOWORD (lParam);
+			sHeight = HIWORD (lParam);
+			if (sOpenGLInitialized)
+			{
+				gDemoApplication->reshape(sWidth,sHeight);
+			}
+			return 0;
+		}
+		break;
+
+	case WM_CREATE:
+		return 0;
+
+	case WM_LBUTTONDOWN:
+		gDemoApplication->mouseFunc(0,0,(int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+	case WM_LBUTTONUP:
+		gDemoApplication->mouseFunc(0,1,(int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+	case WM_MBUTTONDOWN:
+		gDemoApplication->mouseFunc(1,0,(int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+	case WM_MBUTTONUP:
+		gDemoApplication->mouseFunc(1,1,(int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+	case WM_RBUTTONDOWN:
+		gDemoApplication->mouseFunc(2,0,(int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+	case WM_RBUTTONUP:
+		gDemoApplication->mouseFunc(2,1,(int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+
+	case WM_MOUSEMOVE:
+		gDemoApplication->mouseMotionFunc((int)(short)LOWORD(lParam),(int)(short)HIWORD(lParam));
+		break;
+
+	case 0x020A://WM_MOUSEWHEEL:
+		{
+			// high word of wParam is the signed wheel delta
+			int  zDelta = (short)HIWORD(wParam);
+			if (zDelta>0)
+				gDemoApplication->zoomIn();
+			else
+				gDemoApplication->zoomOut();
+			break;
+		}
+
+	case WM_CLOSE:
+		PostQuitMessage( 0 );
+		return 0;
+
+	case WM_DESTROY:
+		return 0;
+
+	case WM_KEYUP:
+		// every path of this inner switch returns
+		switch ( wParam )
+		{
+		case VK_PRIOR:
+		case VK_NEXT:
+		case VK_END:
+		case VK_HOME:
+		case VK_LEFT:
+		case VK_UP:
+		case VK_RIGHT:
+		case VK_DOWN:
+			if (gDemoApplication)
+				gDemoApplication->specialKeyboardUp(wParam,0,0);
+			return 0;
+
+		default:
+			gDemoApplication->keyboardUpCallback(tolower(wParam),0,0);
+			return DefWindowProc( hWnd, message, wParam, lParam );
+		}
+
+	case WM_KEYDOWN:
+		switch ( wParam )
+		{
+		case VK_CONTROL:
+		case VK_PRIOR:
+		case VK_NEXT:
+		case VK_END:
+		case VK_HOME:
+		case VK_LEFT:
+		case VK_UP:
+		case VK_RIGHT:
+		case VK_DOWN:
+			if (gDemoApplication)
+				gDemoApplication->specialKeyboard(wParam,0,0);
+			break;
+
+		case ' ':
+			if (gDemoApplication)
+				gDemoApplication->clientResetScene();
+			break;
+
+		case 'Q':
+		case VK_ESCAPE:
+			// remember the request so the WM_CHAR that follows is not forwarded
+			quitRequest = 1;
+			PostQuitMessage(0);
+			return 0;
+		}
+		return 0;
+
+	case WM_CHAR:
+		if (!quitRequest)
+			gDemoApplication->keyboardCallback(wParam,0,0);
+		break;
+
+	default:
+		return DefWindowProc( hWnd, message, wParam, lParam );
+
+	}
+	return 0;
+}
+
+// Enable OpenGL
+
+/// Fetches the window's device context, selects a pixel format for it, then creates
+/// an OpenGL rendering context and makes it current.
+///
+/// @param hWnd  window to render into
+/// @param hDC   receives the device context
+/// @param hRC   receives the rendering context
+void EnableOpenGL(HWND hWnd, HDC * hDC, HGLRC * hRC)
+{
+	*hDC = GetDC( hWnd );
+
+	// requested framebuffer: double-buffered RGBA, 24 colour / 16 depth / 1 stencil bits
+	PIXELFORMATDESCRIPTOR desc;
+	ZeroMemory( &desc, sizeof( desc ) );
+	desc.nSize = sizeof( desc );
+	desc.nVersion = 1;
+	desc.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
+	desc.iPixelType = PFD_TYPE_RGBA;
+	desc.cColorBits = 24;
+	desc.cDepthBits = 16;
+	desc.cStencilBits = 1;
+	desc.iLayerType = PFD_MAIN_PLANE;
+
+	const int chosenFormat = ChoosePixelFormat( *hDC, &desc );
+	SetPixelFormat( *hDC, chosenFormat, &desc );
+
+	*hRC = wglCreateContext( *hDC );
+	wglMakeCurrent( *hDC, *hRC );
+
+	// from here on WM_SIZE may forward reshape calls to the demo
+	sOpenGLInitialized = true;
+}
+
+// Disable OpenGL
+
+/// Undoes EnableOpenGL: detaches and deletes the rendering context and releases the
+/// window's device context.
+void DisableOpenGL(HWND hWnd, HDC hDC, HGLRC hRC)
+{
+	// cleared first so that WM_SIZE stops forwarding reshape calls
+	sOpenGLInitialized = false;
+
+	wglMakeCurrent( NULL, NULL );
+	wglDeleteContext( hRC );
+	ReleaseDC( hWnd, hDC );
+}
+
+#endif //_WINDOWS
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.cpp
@@ -0,0 +1,79 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifdef _WINDOWS
+
+#include "Win32DemoApplication.h"
+
+
+
+
+#if 0
+void	Win32DemoApplication::renderme()
+{
+}
+void	Win32DemoApplication::setTexturing(bool useTexture)
+{
+}
+	
+void	Win32DemoApplication::setShadows(bool useShadows)
+{
+}
+	
+void	Win32DemoApplication::setCameraDistance(float camDist)
+{
+}
+void	Win32DemoApplication::clientResetScene()
+{
+
+}
+#endif
+
+/// Intentionally empty: this class does not poll modifier keys yet.
+void Win32DemoApplication::updateModifierKeys()
+{
+	//not yet
+}
+
+
+
+/// Maps the four arrow keys (Win32 virtual-key codes) to the step functions;
+/// every other key is ignored.
+///
+/// @param key  virtual-key code
+/// @param x    unused
+/// @param y    unused
+void Win32DemoApplication::specialKeyboard(int key, int x, int y)	
+{
+	(void)x;
+	(void)y;
+
+	if (key == VK_LEFT)
+	{
+		stepLeft();
+	}
+	else if (key == VK_RIGHT)
+	{
+		stepRight();
+	}
+	else if (key == VK_UP)
+	{
+		stepFront();
+	}
+	else if (key == VK_DOWN)
+	{
+		stepBack();
+	}
+	// page up/down (zoom) and home (toggle idle) are not mapped here
+
+}
+
+/// Intentionally empty in this class.
+void	Win32DemoApplication::swapBuffers()
+{
+}
+	
+#endif
+	
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.h
@@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef WIN32_DEMO_APPLICATION_H
+#define WIN32_DEMO_APPLICATION_H
+
+
+#include "DemoApplication.h"
+
+///
+///Win32DemoApplication is a DemoApplication that takes Win32 virtual-key codes as keyboard input
+///
+class Win32DemoApplication : public DemoApplication
+{
+protected:
+
+
+public:
+
+	
+	///empty in this class
+	virtual void	swapBuffers();
+		
+	///handles the arrow keys, given as Win32 virtual-key codes; x and y are ignored
+	void specialKeyboard(int key, int x, int y);
+
+	///empty in this class
+	virtual		void	updateModifierKeys();
+
+	
+};
+
+#endif //WIN32_DEMO_APPLICATION_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/premake4.lua
@@ -0,0 +1,18 @@
+	-- Static library built from every .cpp/.h file below this directory.
+	project "testbed"
+
+	kind "StaticLib"
+	targetdir "../../build/lib"
+	includedirs { ".", "../../bullet2" }
+
+	-- Windows configurations get one extra include directory
+	configuration { "Windows" }
+	includedirs { "../../rendering/GlutGlewWindows" }
+
+	-- back to settings that apply to all configurations
+	configuration {}
+
+	files { "**.cpp", "**.h" }
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/AMD/premake4.lua
@@ -0,0 +1,29 @@
+	
+	-- Static library with the shared 3D grid broadphase sources, built against the
+	-- AMD OpenCL SDK. The project is only generated when findOpenCL_AMD() reports
+	-- that the SDK is available.
+	-- The result is tested directly instead of being stored in a global 'hasCL',
+	-- so it can no longer leak into other build scripts.
+	if findOpenCL_AMD() then
+
+		project "OpenCL_bt3dGridBroadphase_AMD"
+
+		initOpenCL_AMD()
+
+		language "C++"
+
+		kind "StaticLib"
+		targetdir "../../../bin"
+
+		libdirs { "../../../rendering/GlutGlewWindows" }
+
+		includedirs {
+--			"../../../rendering/GlutGlewWindows",
+			"../../../opencl/3dGridBroadphase/Shared",
+			"../../../../../src",
+			"../../primitives"
+		}
+
+		files {
+			"../Shared/*.cpp",
+			"../Shared/*.h"
+		}
+
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/MiniCL/MiniCLTaskWrap.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/MiniCL/MiniCLTaskWrap.cpp
@@ -0,0 +1,23 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <MiniCL/cl_MiniCL_Defs.h>
+
+// MSTRINGIFY is defined here to expand to its bare argument, so the kernel file is
+// pasted in as source code for this translation unit instead of as a string literal.
+extern "C"
+{
+	#define MSTRINGIFY(A) A
+	#include "bt3dGridBroadphaseOCL.cl"
+	#undef MSTRINGIFY
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cl
@@ -0,0 +1,349 @@
+
+MSTRINGIFY(
+
+// Linear cell index for a grid position. The grid dimensions are read from
+// pParams[1] reinterpreted as int4. Each coordinate is wrapped with a (dim - 1)
+// mask; presumably the dimensions are powers of two - TODO confirm on the host side.
+int getPosHash(int4 gridPos, __global float4* pParams)
+{
+	int4 dims = *((__global int4*)(pParams + 1));
+	int cx = gridPos.x & (dims.x - 1);
+	int cy = gridPos.y & (dims.y - 1);
+	int cz = gridPos.z & (dims.z - 1);
+	// x varies fastest, then y, then z
+	return cz * dims.y * dims.x + cy * dims.x + cx;
+} 
+
+// Grid cell containing a world position: each coordinate is multiplied by the
+// matching component of pParams[0], floored and wrapped with a (dim - 1) mask.
+// The w component of the result is not written.
+int4 getGridPos(float4 worldPos, __global float4* pParams)
+{
+	int4 dims = *((__global int4*)(pParams + 1));
+	float4 scale = pParams[0];
+	int4 cell;
+	cell.x = (int)floor(worldPos.x * scale.x) & (dims.x - 1);
+	cell.y = (int)floor(worldPos.y * scale.y) & (dims.y - 1);
+	cell.z = (int)floor(worldPos.z * scale.z) & (dims.z - 1);
+	return cell;
+}
+
+
+// For every body, store (cell hash, body index) in pHash, where the cell is the
+// one containing the centre of the body AABB. pAABB holds min at 2*i and max at 2*i+1.
+__kernel void kCalcHashAABB(int numObjects, __global float4* pAABB, __global int2* pHash, __global float4* pParams GUID_ARG)
+{
+    int bodyIdx = get_global_id(0);
+    if(bodyIdx < numObjects)
+	{
+		float4 lo = pAABB[bodyIdx*2];
+		float4 hi = pAABB[bodyIdx*2 + 1];
+		// AABB centre
+		float4 centre;
+		centre.x = (lo.x + hi.x) * 0.5f;
+		centre.y = (lo.y + hi.y) * 0.5f;
+		centre.z = (lo.z + hi.z) * 0.5f;
+		centre.w = 0.f;
+		int4 cell = getGridPos(centre, pParams);
+		int2 entry;
+		entry.x = getPosHash(cell, pParams);
+		entry.y = bodyIdx;
+		pHash[bodyIdx] = entry;
+	}
+}
+
+// Marks every cell as empty by writing -1 as its start index.
+__kernel void kClearCellStart(	int numCells, 
+								__global int* pCellStart GUID_ARG)
+{
+    int cell = get_global_id(0);
+    if(cell < numCells)
+	{
+		pCellStart[cell] = -1;
+	}
+}
+
+// For every position in pHash whose hash differs from the hash of the previous
+// position (and for position 0), writes that position into cellStart[hash].
+// NOTE(review): presumably pHash is already sorted by hash when this runs - confirm in the host code.
+__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart GUID_ARG)
+{
+	// slot 0 holds the hash preceding this work-group, slot (local id + 1) the own hash;
+	// 513 slots therefore cover a work-group of at most 512 items
+	__local int sharedHash[513];
+    int index = get_global_id(0);
+	int2 sortedData;
+    if(index < numObjects)
+	{
+		sortedData = pHash[index];
+		// Load hash data into shared memory so that we can look 
+		// at neighboring body's hash value without loading
+		// two hash values per thread
+		sharedHash[get_local_id(0) + 1] = sortedData.x;
+		if((index > 0) && (get_local_id(0) == 0))
+		{
+			// first thread in block must load neighbor body hash
+			sharedHash[0] = pHash[index-1].x;
+		}
+	}
+	// all local writes must be done before any work-item reads its neighbour slot
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(index < numObjects)
+	{
+		// sharedHash[local id] is the hash of the previous position
+		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))
+		{
+			cellStart[sortedData.x] = index;
+		}
+	}
+}
+
+// Returns non-zero when the two boxes overlap or touch on all three axes.
+// The w components are ignored.
+int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
+{
+	int overlapX = (min0.x <= max1.x) && (min1.x <= max0.x);
+	int overlapY = (min0.y <= max1.y) && (min1.y <= max0.y);
+	int overlapZ = (min0.z <= max1.z) && (min1.z <= max0.z);
+	return overlapX && overlapY && overlapZ;
+}
+
+
+
+
+
+// Tests the body at position 'index' of pHash against the bodies stored for grid cell
+// 'gridPos' and records overlapping handles in that body's slice of pPairBuff.
+// The slice starts at pPairBuffStartCurr[handle].x and currently holds
+// pPairBuffStartCurr[handle].y entries; the handle is read from the bit pattern of min.w.
+// A handle that is already stored gets flag 0x40000000 set, a handle that is appended
+// gets flag 0x20000000 set.
+void findPairsInCell(	int numObjects,
+						int4	gridPos,
+						int    index,
+						__global int2*  pHash,
+						__global int*   pCellStart,
+						__global float4* pAABB, 
+						__global int*   pPairBuff,
+						__global int2*	pPairBuffStartCurr,
+						__global float4* pParams)
+{
+	int4 pGridDim = *((__global int4*)(pParams + 1));
+	// the w component of the grid dimensions carries the per-cell scan limit
+	int maxBodiesPerCell = pGridDim.w;
+    int gridHash = getPosHash(gridPos, pParams);
+    // get start of bucket for this cell
+    int bucketStart = pCellStart[gridHash];
+    if (bucketStart == -1)
+	{
+        return;   // cell empty
+	}
+	// iterate over bodies in this cell
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+    float4 min0 = pAABB[unsorted_indx*2 + 0]; 
+	float4 max0 = pAABB[unsorted_indx*2 + 1];
+	int handleIndex =  as_int(min0.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	// the slice ends where the slice of the next handle starts; one slot is kept free
+	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	int curr_max = start_curr_next.x - start - 1;
+	int bucketEnd = bucketStart + maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;
+	for(int index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        int2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		int unsorted_indx2 = cellData.y;
+		// only bodies with a smaller index are tested, which also excludes the body itself
+        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
+        {   
+			float4 min1 = pAABB[unsorted_indx2*2 + 0];
+			float4 max1 = pAABB[unsorted_indx2*2 + 1];
+			if(testAABBOverlap(min0, max0, min1, max1))
+			{
+				int handleIndex2 = as_int(min1.w);
+				int k;
+				// look for the handle among the entries already stored, ignoring both flag bits
+				for(k = 0; k < curr; k++)
+				{
+					int old_pair = pPairBuff[start+k] & (~0x60000000);
+					if(old_pair == handleIndex2)
+					{
+						pPairBuff[start+k] |= 0x40000000;
+						break;
+					}
+				}
+				// not stored yet: append it unless the slice is full
+				if(k == curr)
+				{
+					if(curr >= curr_max) 
+					{ // not a good solution, but let's avoid crash
+						break;
+					}
+					pPairBuff[start+curr] = handleIndex2 | 0x20000000;
+					curr++;
+				}
+			}
+		}
+	}
+	// write back the updated entry count of this slice
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = curr;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+    return;
+}
+
+// For the body at each position of pHash: locate the grid cell containing the centre
+// of its AABB and run findPairsInCell on that cell and its 26 neighbours.
+// The w components of the centre and of the cell position are zeroed (as in
+// kCalcHashAABB) so that no uninitialised vector component is passed by value.
+__kernel void kFindOverlappingPairs(	int numObjects,
+										__global float4* pAABB, 
+										__global int2* pHash, 
+										__global int* pCellStart, 
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global float4* pParams GUID_ARG)
+
+{
+    int index = get_global_id(0);
+    if(index >= numObjects)
+	{
+		return;
+	}
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+	float4 bbMin = pAABB[unsorted_indx*2 + 0];
+	float4 bbMax = pAABB[unsorted_indx*2 + 1];
+	float4 pos;
+	pos.x = (bbMin.x + bbMax.x) * 0.5f;
+	pos.y = (bbMin.y + bbMax.y) * 0.5f;
+	pos.z = (bbMin.z + bbMax.z) * 0.5f;
+	pos.w = 0.f;
+    // get address in grid
+    int4 gridPosA = getGridPos(pos, pParams);
+    int4 gridPosB; 
+	gridPosB.w = 0;
+    // examine only neighbouring cells; getPosHash wraps coordinates that leave the grid
+    for(int z=-1; z<=1; z++) 
+    {
+		gridPosB.z = gridPosA.z + z;
+        for(int y=-1; y<=1; y++) 
+        {
+			gridPosB.y = gridPosA.y + y;
+            for(int x=-1; x<=1; x++) 
+            {
+				gridPosB.x = gridPosA.x + x;
+                findPairsInCell(numObjects, gridPosB, index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams);
+            }
+        }
+    }
+}
+
+
+// Tests the body at each position of pHash against the numLarge AABBs stored after
+// the first numObjects entries of pAABB, and records overlapping handles in the
+// body's slice of pPairBuff using the same flags as findPairsInCell:
+// 0x40000000 on a handle that was already stored, 0x20000000 on an appended one.
+// The capacity check is done before the store, as in findPairsInCell, so a full
+// slice is left untouched instead of receiving a value that is never counted.
+__kernel void kFindPairsLarge(	int numObjects, 
+								__global float4* pAABB, 
+								__global int2* pHash, 
+								__global int* pCellStart, 
+								__global int* pPairBuff, 
+								__global int2* pPairBuffStartCurr, 
+								uint numLarge GUID_ARG)
+{
+    int index = get_global_id(0);
+    if(index >= numObjects)
+	{
+		return;
+	}
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+	float4 min0 = pAABB[unsorted_indx*2 + 0];
+	float4 max0 = pAABB[unsorted_indx*2 + 1];
+	// the handle is stored in the bit pattern of min.w
+	int handleIndex =  as_int(min0.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	// the slice ends where the slice of the next handle starts; one slot is kept free
+	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	int curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		int indx2 = numObjects + i;
+		float4 min1 = pAABB[indx2*2 + 0];
+		float4 max1 = pAABB[indx2*2 + 1];
+		if(testAABBOverlap(min0, max0, min1, max1))
+		{
+			int k;
+			int handleIndex2 =  as_int(min1.w);
+			// look for the handle among the entries already stored, ignoring both flag bits
+			for(k = 0; k < curr; k++)
+			{
+				int old_pair = pPairBuff[start+k] & (~0x60000000);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= 0x40000000;
+					break;
+				}
+			}
+			if(k == curr)
+			{
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | 0x20000000;
+				curr++;
+			}
+		}
+    }
+	// write back the updated entry count of this slice
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = curr;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+    return;
+}
+
+// For each body, counts the entries of its pair slice that do not carry flag
+// 0x40000000 and stores the count in pPairScan[index+1].
+__kernel void kComputePairCacheChanges(	int numObjects,
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global int* pPairScan, 
+										__global float4* pAABB GUID_ARG)
+{
+    int index = get_global_id(0);
+    if(index < numObjects)
+	{
+		// the handle is stored in the bit pattern of min.w
+		int handleIndex = as_int(pAABB[index * 2].w);
+		int2 slice = pPairBuffStartCurr[handleIndex];
+		int first = slice.x;
+		int count = slice.y;
+		int changed = 0;
+		for(int k = 0; k < count; k++)
+		{
+			if((pPairBuff[first + k] & 0x40000000) == 0)
+			{
+				changed++;
+			}
+		}
+		pPairScan[index+1] = changed;
+	}
+} 
+
+// For each body: copies every entry of its pair slice that lacks flag 0x40000000
+// to pPairOut, starting at offset pPairScan[index+1], and compacts the slice in place
+// so that only entries carrying flag 0x20000000 or 0x40000000 remain, with both
+// flags cleared. The new entry count is written back to pPairBuffStartCurr.
+// NOTE(review): presumably pPairScan has been prefix-summed between
+// kComputePairCacheChanges and this kernel - confirm in the host code.
+__kernel void kSqueezeOverlappingPairBuff(	int numObjects,
+											__global int* pPairBuff, 
+											__global int2* pPairBuffStartCurr, 
+											__global int* pPairScan,
+											__global int* pPairOut, 
+											__global float4* pAABB GUID_ARG)
+{
+    int index = get_global_id(0);
+    if(index >= numObjects)
+	{
+		return;
+	}
+	float4 bbMin = pAABB[index * 2];
+	// the handle is stored in the bit pattern of min.w
+	int handleIndex = as_int(bbMin.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	__global int* pInp = pPairBuff + start;
+	__global int* pOut = pPairOut + pPairScan[index+1];
+	// write cursor of the in-place compaction; it never moves ahead of the read cursor pInp
+	__global int* pOut2 = pInp;
+	int num = 0; 
+	for(int k = 0; k < curr; k++, pInp++)
+	{
+		// entry without flag 0x40000000: copied to the output list with its flags intact
+		if(!((*pInp) & 0x40000000))
+		{
+			*pOut = *pInp;
+			pOut++;
+		}
+		// entry with at least one flag set: kept in the slice, flags cleared
+		if((*pInp) & 0x60000000)
+		{
+			*pOut2 = (*pInp) & (~0x60000000);
+			pOut2++;
+			num++;
+		}
+	}
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = num;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+}
+
+
+
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
@@ -0,0 +1,697 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+
+#include "bt3dGridBroadphaseOCL.h"
+
+#include <stdio.h>
+#include <string.h>
+#include "Adl/Adl.h"
+#include <AdlPrimitives/Scan/PrefixScan.h>
+#include <AdlPrimitives/Sort/RadixSort32.h>
+#include <AdlPrimitives/Sort/RadixSort.h>
+
+// When defined, the broadphase stages in this file call clFinish() after
+// enqueueing their work (see the #ifdef sites further down).
+#define ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+
+// Path of the kernel source file; handed to compileCLProgramFromString() as its
+// last argument in initCL().
+#define GRID_OCL_PATH "..\\..\\opencl\\3dGridBroadphase\\Shared\\bt3dGridBroadphaseOCL.cl"
+
+
+// Turns its argument into a string literal.
+#define MSTRINGIFY(A) #A
+
+// Kernel source text: the .cl file is textually included as the initializer.
+// NOTE(review): this relies on the .cl file wrapping its contents in
+// MSTRINGIFY( ... ); so that the include yields one string literal - confirm
+// against the top of the .cl file.
+static const char* spProgramSource = 
+#include "bt3dGridBroadphaseOCL.cl"
+
+// NOTE(review): the Adl objects below live at file scope, not in the class. The
+// constructor assigns them and the destructor frees them, so a second
+// bt3dGridBroadphaseOCL instance would overwrite the pointers of the first one.
+
+// Prefix scan working data, allocated in the constructor and used by scanOverlappingPairBuff().
+adl::PrefixScan<adl::TYPE_CL>::Data* gData1=0;
+// Source buffer for the prefix scan (maxSmallProxies+2 elements, filled in the constructor).
+adl::Buffer<unsigned int>* m_srcClBuffer=0;
+
+// Key/value pair; not referenced by the code in this file.
+struct MySortData
+{
+	int key;
+	int value;
+};
+
+// Radix sort working data: dataC is used by the OpenCL path of sortHash(),
+// dataHost by its USE_HOST path.
+adl::RadixSort32<adl::TYPE_CL>::Data* dataC = 0;
+adl::RadixSort<adl::TYPE_HOST>::Data* dataHost = 0;
+
+
+// Value the constructor writes to element [maxSmallProxies] of m_srcClBuffer.
+static unsigned int infElem = 0x2fffffff;
+
+// Value the constructor writes to the first and last element of m_srcClBuffer.
+static unsigned int zeroEl = 0;
+// All bits set (0xFFFFFFFF); not referenced by the code in this file.
+static unsigned int minusOne= -1;
+
+
+/// Sets up the OpenCL broadphase: compiles the program, creates and prefills the
+/// device buffers, creates the kernels, then prepares the Adl devices plus the
+/// scan and sort working data used by sortHash() and scanOverlappingPairBuff().
+/// When deviceCL is null an adl::DeviceCL wrapping context/device/queue is created
+/// here and m_ownsDevice is set so that the destructor deletes it.
+bt3dGridBroadphaseOCL::bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
+												const btVector3& cellSize, 
+												int gridSizeX, int gridSizeY, int gridSizeZ, 
+												int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+												btScalar maxSmallProxySize,
+												int maxSmallProxiesPerCell,
+												cl_context context, cl_device_id device, cl_command_queue queue,
+												adl::DeviceCL* deviceCL
+												) : 
+	btGpu3DGridBroadphase(overlappingPairCache, cellSize, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy, maxSmallProxySize, maxSmallProxiesPerCell)
+{
+	// Raw OpenCL side first: program, buffers, initial buffer contents, kernels.
+	initCL(context, device, queue);
+	allocateBuffers();
+	prefillBuffers();
+	initKernels();
+
+	// Adl host device, needed by the host radix sort data allocated below.
+	adl::DeviceUtils::Config hostConfig;
+	m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, hostConfig );
+
+	// Use the Adl CL device of the caller when there is one, otherwise wrap the raw handles.
+	m_ownsDevice = (deviceCL == 0);
+	if (m_ownsDevice)
+	{
+		deviceCL = new adl::DeviceCL;
+		deviceCL->m_context = context;
+		deviceCL->m_deviceIdx = device;
+		deviceCL->m_commandQueue = queue;
+		deviceCL->m_kernelManager = new adl::KernelManager;
+	}
+	m_deviceCL = deviceCL;
+
+	// The CL scan and CL sort data are sized for at least 256K elements.
+	const int minSortCapacity = 256*1024;
+	int sortCapacity = maxSmallProxies;
+	if (sortCapacity < minSortCapacity)
+	{
+		sortCapacity = minSortCapacity;
+	}
+
+	// Scan source buffer: the first and the last element are zero and element
+	// [maxSmallProxies] holds infElem.
+	m_srcClBuffer = new adl::Buffer<unsigned int> (m_deviceCL, maxSmallProxies+2);
+	m_srcClBuffer->write(&zeroEl, 1, 0);
+	m_srcClBuffer->write(&infElem, 1, maxSmallProxies);
+	m_srcClBuffer->write(&zeroEl, 1, maxSmallProxies+1);
+	m_deviceCL->waitForCompletion();
+
+	gData1 = adl::PrefixScan<adl::TYPE_CL>::allocate( m_deviceCL, sortCapacity+2, adl::PrefixScanBase::EXCLUSIVE );
+	dataHost = adl::RadixSort<adl::TYPE_HOST>::allocate( m_deviceHost, maxSmallProxies+2 );
+	dataC = adl::RadixSort32<adl::TYPE_CL>::allocate( m_deviceCL, sortCapacity+2 );
+}
+
+
+
+/// Frees everything the constructor and its helpers created: the Adl scan/sort
+/// data, the scan source buffer, the Adl host device, the OpenCL kernels, device
+/// buffers and program, and (only when owned) the Adl CL device.
+/// The context, device id and command queue belong to the caller and are left alone.
+bt3dGridBroadphaseOCL::~bt3dGridBroadphaseOCL()
+{
+	//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
+	assert(m_bInitialized);
+
+	// The Adl objects live in file-scope pointers; clear them after freeing so
+	// nothing can reach the released memory afterwards.
+	adl::RadixSort<adl::TYPE_HOST>::deallocate(dataHost);
+	dataHost = 0;
+	adl::PrefixScan<adl::TYPE_CL>::deallocate(gData1);
+	gData1 = 0;
+	adl::RadixSort32<adl::TYPE_CL>::deallocate(dataC);
+	dataC = 0;
+	adl::DeviceUtils::deallocate(m_deviceHost);
+	m_deviceHost = 0;
+	delete m_srcClBuffer;
+	m_srcClBuffer = 0;
+
+	// Release the kernels created by initKernel().
+	for (int i = 0; i < GRID3DOCL_KERNEL_TOTAL; i++)
+	{
+		if (m_kernels[i].m_kernel)
+		{
+			clReleaseKernel(m_kernels[i].m_kernel);
+			m_kernels[i].m_kernel = 0;
+		}
+	}
+
+	// Release the device buffers created by allocateBuffers().
+	cl_mem* const deviceBuffers[] =
+	{
+		&m_dBodiesHash, &m_dCellStart, &m_dPairBuff, &m_dPairBuffStartCurr, &m_dAABB,
+		&m_dPairScanChanged, &m_dPairsChanged, &m_dPairsContiguous, &m_dBpParams
+	};
+	const int numDeviceBuffers = (int)(sizeof(deviceBuffers) / sizeof(deviceBuffers[0]));
+	for (int i = 0; i < numDeviceBuffers; i++)
+	{
+		if (*deviceBuffers[i])
+		{
+			clReleaseMemObject(*deviceBuffers[i]);
+			*deviceBuffers[i] = 0;
+		}
+	}
+
+	// Release the program compiled by initCL().
+	if (m_cpProgram)
+	{
+		clReleaseProgram(m_cpProgram);
+		m_cpProgram = 0;
+	}
+
+	if (m_ownsDevice)
+	{
+		delete m_deviceCL->m_kernelManager;
+		delete m_deviceCL;
+		m_deviceCL = 0;
+	}
+}
+
+#ifdef CL_PLATFORM_MINI_CL
+// MiniCL build only: forward-declare the kernel entry points so that initCL()
+// can register them explicitly.
+// Background: with MSVC9, static constructors are not called for variables that
+// are defined in a library and never used (this looks like an optimization and
+// probably happens with other compilers as well). Registering the kernels again
+// by hand makes this robust, and doing so twice is safe.
+#define MINICL_DECLARE(a) extern "C" void a();
+MINICL_DECLARE(kCalcHashAABB)
+MINICL_DECLARE(kClearCellStart)
+MINICL_DECLARE(kFindCellStart)
+MINICL_DECLARE(kFindOverlappingPairs)
+MINICL_DECLARE(kFindPairsLarge)
+MINICL_DECLARE(kComputePairCacheChanges)
+MINICL_DECLARE(kSqueezeOverlappingPairBuff)
+#undef MINICL_DECLARE
+#endif
+
+/// Stores the OpenCL handles supplied by the caller and compiles the broadphase
+/// program from the embedded kernel source.
+/// @param context  OpenCL context, must not be null
+/// @param device   OpenCL device, must not be null
+/// @param queue    OpenCL command queue, must not be null
+/// Prints "OK" when the program was built; otherwise prints an error line and
+/// asserts, instead of reporting success unconditionally.
+void bt3dGridBroadphaseOCL::initCL(cl_context context, cl_device_id device, cl_command_queue queue)
+{
+
+	#ifdef CL_PLATFORM_MINI_CL
+		// call constructors here
+		MINICL_REGISTER(kCalcHashAABB)
+		MINICL_REGISTER(kClearCellStart)
+		MINICL_REGISTER(kFindCellStart)
+		MINICL_REGISTER(kFindOverlappingPairs)
+		MINICL_REGISTER(kFindPairsLarge)
+		MINICL_REGISTER(kComputePairCacheChanges)
+		MINICL_REGISTER(kSqueezeOverlappingPairBuff)
+	#endif
+
+	// Start from a defined value: the helper only receives a pointer to this and
+	// may not write it on every path.
+	cl_int ciErrNum = CL_SUCCESS;
+
+	btAssert(context);
+	m_cxMainContext = context;
+	btAssert(device);
+	m_cdDevice = device;
+	btAssert(queue);
+	m_cqCommandQue = queue;
+	
+	//adl::Kernel kern = m_deviceCL->getKernel(fileName,funcName,options,src);
+	
+	m_cpProgram = btOpenCLUtils::compileCLProgramFromString(m_cxMainContext,m_cdDevice,spProgramSource, &ciErrNum,"-DGUID_ARG=""""",GRID_OCL_PATH);
+	
+	// A null program handle cannot be used by clCreateKernel, so treat it (and any
+	// reported error code) as a build failure.
+	if (m_cpProgram && (ciErrNum == CL_SUCCESS))
+	{
+		printf("OK\n");
+	}
+	else
+	{
+		printf("3D GRID OCL Error : failed to build %s (error %d)\n", GRID_OCL_PATH, ciErrNum);
+		btAssert(m_cpProgram && (ciErrNum == CL_SUCCESS));
+	}
+}
+
+
+/// Creates the seven broadphase kernels and binds their buffer arguments.
+/// Argument 0 of every kernel is left unset here; runKernelWithWorkgroupSize()
+/// sets it to the element count at launch time. Argument 6 of kFindPairsLarge
+/// is set in findPairsLarge().
+void bt3dGridBroadphaseOCL::initKernels()
+{
+	// every argument bound here is a cl_mem handle
+	const int memArgSize = sizeof(cl_mem);
+
+	initKernel(GRID3DOCL_KERNEL_CALC_HASH_AABB, "kCalcHashAABB");
+	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 1, memArgSize, &m_dAABB);
+	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 2, memArgSize, &m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, 3, memArgSize, &m_dBpParams);
+
+	initKernel(GRID3DOCL_KERNEL_CLEAR_CELL_START, "kClearCellStart");
+	setKernelArg(GRID3DOCL_KERNEL_CLEAR_CELL_START, 1, memArgSize, &m_dCellStart);
+
+	initKernel(GRID3DOCL_KERNEL_FIND_CELL_START, "kFindCellStart");
+	setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, 1, memArgSize, &m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, 2, memArgSize, &m_dCellStart);
+
+	initKernel(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, "kFindOverlappingPairs");
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 1, memArgSize, &m_dAABB);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 2, memArgSize, &m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 3, memArgSize, &m_dCellStart);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 4, memArgSize, &m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 5, memArgSize, &m_dPairBuffStartCurr);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, 6, memArgSize, &m_dBpParams);
+
+	initKernel(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, "kFindPairsLarge");
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 1, memArgSize, &m_dAABB);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 2, memArgSize, &m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 3, memArgSize, &m_dCellStart);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 4, memArgSize, &m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 5, memArgSize, &m_dPairBuffStartCurr);
+
+	initKernel(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, "kComputePairCacheChanges");
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 1, memArgSize, &m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 2, memArgSize, &m_dPairBuffStartCurr);
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 3, memArgSize, &m_dPairScanChanged);
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, 4, memArgSize, &m_dAABB);
+
+	initKernel(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, "kSqueezeOverlappingPairBuff");
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 1, memArgSize, &m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 2, memArgSize, &m_dPairBuffStartCurr);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 3, memArgSize, &m_dPairScanChanged);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 4, memArgSize, &m_dPairsChanged);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, 5, memArgSize, &m_dAABB);
+}
+
+
+/// Creates all device buffers of the broadphase in m_cxMainContext and computes
+/// m_hashSize. Every clCreateBuffer result is checked with GRID3DOCL_CHECKERROR.
+void bt3dGridBroadphaseOCL::allocateBuffers()
+{
+	cl_int status;
+	unsigned int bytes;
+
+	// Hash capacity: smallest power of two that is >= m_maxHandles (at most 31 doublings).
+	m_hashSize = 1;
+	for(int shift = 0; (shift < 31) && (m_hashSize < m_maxHandles); ++shift)
+	{
+		m_hashSize <<= 1;
+	}
+
+	// Body hash buffer: two unsigned ints per entry, never smaller than 1 MB.
+	bytes = m_hashSize * 2 * sizeof(unsigned int);
+	if (bytes < 1024*1024)
+	{
+		bytes = 1024*1024;
+	}
+	m_dBodiesHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, bytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// One unsigned int per grid cell.
+	bytes = m_numCells * sizeof(unsigned int);
+	m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, bytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// Pair lists: m_maxPairsPerBody entries for each handle.
+	const unsigned int pairBytes = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+	m_dPairBuff = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, pairBytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// Start/count records of the pair lists: two unsigned ints per handle plus one.
+	bytes = (m_maxHandles * 2 + 1) * sizeof(unsigned int);
+	m_dPairBuffStartCurr = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, bytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// AABBs: two 4-float vectors for every small and every large proxy.
+	const unsigned int aabbCount = m_maxHandles + m_maxLargeHandles;
+	bytes = aabbCount * sizeof(float) * 4 * 2;
+	m_dAABB = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, bytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// Per-object change counts and their scan: m_maxHandles + 2 unsigned ints.
+	bytes = (m_maxHandles + 2) * sizeof(unsigned int);
+	m_dPairScanChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, bytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// Two more buffers with the same size as the pair list buffer.
+	m_dPairsChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, pairBytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	m_dPairsContiguous = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, pairBytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	// Grid parameters: room for three 4-float vectors.
+	bytes = 3 * 4 * sizeof(float);
+	m_dBpParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, bytes, NULL, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+/// Uploads the initial contents of the hash, start/count and pair buffers:
+/// the hash entries are set to 0xFF bytes (including the entries between
+/// m_maxHandles and m_hashSize), the pair buffer is zeroed, and the host
+/// start/count array is copied as it is.
+void bt3dGridBroadphaseOCL::prefillBuffers()
+{
+	const size_t hashBytes = m_maxHandles * 2 * sizeof(unsigned int);
+	const size_t pairBytes = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+
+	memset(m_hBodiesHash, 0xFF, hashBytes);
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, hashBytes);
+
+	// The device hash buffer holds m_hashSize entries (a power of two); fill the
+	// entries past m_maxHandles from the start of the same host array.
+	const int padding = m_hashSize - m_maxHandles;
+	if(padding != 0)
+	{
+		copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, padding * 2 * sizeof(unsigned int), hashBytes, 0);
+	}
+
+	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); 
+
+	memset(m_hPairBuff, 0x00, pairBytes);
+	copyArrayToDevice(m_dPairBuff, m_hPairBuff, pairBytes);
+}
+
+
+/// Creates the kernel called pName from m_cpProgram and records it, together
+/// with the work-group size reported for m_cdDevice, in m_kernels[kernelId].
+/// @param kernelId  slot in m_kernels (one of the GRID3DOCL_KERNEL_* ids)
+/// @param pName     kernel function name; the pointer itself is stored
+void bt3dGridBroadphaseOCL::initKernel(int kernelId, char* pName)
+{
+	cl_int status;
+	cl_kernel created = clCreateKernel(m_cpProgram, pName, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	size_t maxGroupSize;
+	status = clGetKernelWorkGroupInfo(created, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxGroupSize, NULL);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	bt3dGridOCLKernelInfo& info = m_kernels[kernelId];
+	info.m_Id = kernelId;
+	info.m_kernel = created;
+	info.m_name = pName;
+	info.m_workgroupSize = (int)maxGroupSize;
+}
+
+/// Launches kernel kernelId over globalSize elements and flushes the queue.
+/// globalSize is passed to the kernel as argument 0. The work-group size is the
+/// recorded kernel work-group size capped at 64, and the global size is rounded
+/// up to a multiple of it. Does nothing when globalSize <= 0.
+void bt3dGridBroadphaseOCL::runKernelWithWorkgroupSize(int kernelId, int globalSize)
+{
+	if(globalSize <= 0)
+	{
+		return;
+	}
+	cl_kernel kernel = m_kernels[kernelId].m_kernel;
+	cl_int status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&globalSize);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	const int groupSize = btMin(64,m_kernels[kernelId].m_workgroupSize);
+
+	size_t globalWork[2];
+	size_t localWork[2];
+	// stays NULL when no usable group size is known, so the OpenCL library picks one
+	const size_t* localWorkArg = NULL;
+	globalWork[0] = globalSize;
+	globalWork[1] = 1;
+	if(groupSize > 0)
+	{
+		// number of groups needed to cover globalSize, rounding up
+		int numGroups = globalSize / groupSize;
+		if((globalSize % groupSize) != 0)
+		{
+			numGroups++;
+		}
+		globalWork[0] = numGroups * groupSize;
+		localWork[0] = groupSize;
+		localWork[1] = 1;
+		localWorkArg = localWork;
+	}
+	status = clEnqueueNDRangeKernel(m_cqCommandQue, kernel, 1, NULL, globalWork, localWorkArg, 0,0,0 );
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	status = clFlush(m_cqCommandQue);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+
+/// Sets argument argNum of kernel kernelId via clSetKernelArg and checks the result.
+/// @param argSize  size in bytes of the value argPtr points to
+void bt3dGridBroadphaseOCL::setKernelArg(int kernelId, int argNum, int argSize, void* argPtr)
+{
+	cl_kernel kernel = m_kernels[kernelId].m_kernel;
+	cl_int status = clSetKernelArg(kernel, argNum, argSize, argPtr);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+
+/// Blocking write of size bytes from host memory to a device buffer.
+/// @param devOffs   byte offset into the device buffer
+/// @param hostOffs  byte offset added to the host pointer
+/// Does nothing when size is 0.
+void bt3dGridBroadphaseOCL::copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs, int hostOffs)
+{
+	if (!size)
+	{
+		return;
+	}
+	char* src = (char*)host + hostOffs;
+	cl_int status = clEnqueueWriteBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, src, 0, NULL, NULL);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+/// Blocking read of size bytes from a device buffer into host memory.
+/// @param hostOffs  byte offset added to the host pointer
+/// @param devOffs   byte offset into the device buffer
+/// Does nothing when size is 0.
+void bt3dGridBroadphaseOCL::copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs, int devOffs)
+{
+	if (!size)
+	{
+		return;
+	}
+	char* dst = (char*)host + hostOffs;
+	cl_int status = clEnqueueReadBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, dst, 0, NULL, NULL);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+
+
+//
+// overrides
+//
+
+
+/// Runs the base class prepareAABB() and then uploads the host AABB array
+/// (two bt3DGrid3F1U records for every small and large handle) to m_dAABB.
+void bt3dGridBroadphaseOCL::prepareAABB()
+{
+	btGpu3DGridBroadphase::prepareAABB();
+
+	const size_t aabbBytes = sizeof(bt3DGrid3F1U) * 2 * (m_numHandles + m_numLargeHandles);
+	copyArrayToDevice(m_dAABB, m_hAABB, aabbBytes); 
+}
+
+/// Forwards the parameters to the base class, then packs the inverse cell size
+/// and the grid size from m_params into the layout uploaded to m_dBpParams:
+/// four floats (x, y, z, 0) followed by four ints (x, y, z, max bodies per cell).
+void bt3dGridBroadphaseOCL::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	btGpu3DGridBroadphase::setParameters(hostParams);
+
+	struct btParamsBpOCL
+	{
+		float m_invCellSize[4];
+		int   m_gridSize[4];
+	};
+	btParamsBpOCL packed;
+
+	float* invCell = packed.m_invCellSize;
+	invCell[0] = m_params.m_invCellSizeX;
+	invCell[1] = m_params.m_invCellSizeY;
+	invCell[2] = m_params.m_invCellSizeZ;
+	invCell[3] = 0.f;
+
+	int* grid = packed.m_gridSize;
+	grid[0] = m_params.m_gridSizeX;
+	grid[1] = m_params.m_gridSizeY;
+	grid[2] = m_params.m_gridSizeZ;
+	grid[3] = m_params.m_maxBodiesPerCell;
+
+	copyArrayToDevice(m_dBpParams, &packed, sizeof(btParamsBpOCL));
+}
+
+
+/// Launches the kCalcHashAABB kernel over the m_numHandles small proxies.
+/// The disabled #else branch calls the base class implementation instead.
+void bt3dGridBroadphaseOCL::calcHashAABB()
+{
+	BT_PROFILE("calcHashAABB");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CALC_HASH_AABB, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	// wait for the kernel to finish so that the profile zone covers its run time
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+
+#else
+	btGpu3DGridBroadphase::calcHashAABB();
+#endif
+	
+	return;
+}
+
+
+/// Sorts the body hash entries in m_dBodiesHash.
+///  - MiniCL build: sorts on the host with the base class and uploads the result.
+///  - USE_HOST (disabled): downloads, sorts with the Adl host radix sort, uploads.
+///  - default: sorts on the device with adl::RadixSort32.
+void bt3dGridBroadphaseOCL::sortHash()
+{
+	BT_PROFILE("sortHash");
+#ifdef CL_PLATFORM_MINI_CL
+	//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+	btGpu3DGridBroadphase::sortHash();
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#else
+	
+//#define USE_HOST
+#ifdef USE_HOST
+	copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+	//adl::Buffer<unsigned int> keysIn,keysOut,valuesIn,valuesOut;
+	///adl::RadixSort32<adl::TYPE_CL>::execute(dataC,keysIn,keysOut,valuesIn,valuesOut,m_numHandles);
+	adl::HostBuffer<adl::SortData> inoutHost;
+	inoutHost.m_device = m_deviceHost;
+	inoutHost.m_ptr = (adl::SortData*)m_hBodiesHash;
+	inoutHost.m_size = m_numHandles;
+	adl::RadixSort<adl::TYPE_HOST>::execute(dataHost, inoutHost,m_numHandles);
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#else
+	{
+	// make sure the hash values produced by earlier kernels are complete
+	clFinish(m_cqCommandQue);
+	BT_PROFILE("RadixSort32::execute");
+	// wrap the raw cl_mem handle in an Adl buffer without allocating anything
+	adl::Buffer<adl::SortData> inout;
+	inout.m_device = this->m_deviceCL;
+	inout.m_size = m_numHandles;
+	inout.m_ptr = (adl::SortData*)m_dBodiesHash;
+	int actualHandles = m_numHandles;
+	int dataAlignment = adl::RadixSort32<adl::TYPE_CL>::DATA_ALIGNMENT;
+
+	// round the element count up to a multiple of DATA_ALIGNMENT
+	// NOTE(review): the rounded count can exceed inout.m_size; presumably the
+	// extra entries are the 0xFF-filled padding written by prefillBuffers() - confirm
+	if (actualHandles%dataAlignment)
+	{
+		actualHandles += dataAlignment-(actualHandles%dataAlignment);
+	}
+
+	adl::RadixSort32<adl::TYPE_CL>::execute(dataC,inout, actualHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	}
+	{
+		//BT_PROFILE("copyArrayFromDevice");
+	//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	}
+
+
+#endif //USE_HOST
+#endif
+
+	return;
+}
+
+
+
+/// Fills m_dCellStart.
+///  - MiniCL build: computes it with the base class and uploads the host array.
+///  - otherwise: runs kClearCellStart over all m_numCells cells, then
+///    kFindCellStart over the m_numHandles hash entries.
+void bt3dGridBroadphaseOCL::findCellStart()
+{
+#if 1
+	BT_PROFILE("findCellStart");
+		
+	#if defined(CL_PLATFORM_MINI_CL)
+		btGpu3DGridBroadphase::findCellStart();
+		copyArrayToDevice(m_dCellStart, m_hCellStart, m_numCells * sizeof(unsigned int));
+	#else
+			runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CLEAR_CELL_START, m_numCells);	
+			runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_CELL_START, m_numHandles);
+	#endif
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findCellStart();
+#endif
+
+	return;
+}
+
+
+
+/// Launches the kFindOverlappingPairs kernel over the m_numHandles small proxies.
+/// The disabled #else branch runs the base class version and uploads the
+/// resulting start/count and pair arrays instead.
+void bt3dGridBroadphaseOCL::findOverlappingPairs()
+{
+#if 1
+	BT_PROFILE("findOverlappingPairs");
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findOverlappingPairs();
+	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); 
+	copyArrayToDevice(m_dPairBuff, m_hPairBuff, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
+#endif
+	return;
+}
+
+
+/// Launches the kFindPairsLarge kernel over the m_numHandles small proxies, but
+/// only when at least one large proxy exists. The number of large proxies is
+/// passed as kernel argument 6 right before the launch.
+void bt3dGridBroadphaseOCL::findPairsLarge()
+{
+	BT_PROFILE("findPairsLarge");
+#if 1
+	if(m_numLargeHandles)
+	{
+		setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 6, sizeof(int),(void*)&m_numLargeHandles);
+		runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, m_numHandles);
+	}
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	// executed even when no kernel was launched above
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findPairsLarge();
+#endif
+	return;
+}
+
+
+
+/// Launches the kComputePairCacheChanges kernel over the m_numHandles small
+/// proxies and reads the per-object counts (m_numHandles + 2 unsigned ints)
+/// back into m_hPairScanChanged.
+/// The disabled #else branch computes the counts on the host and uploads them.
+void bt3dGridBroadphaseOCL::computePairCacheChanges()
+{
+	BT_PROFILE("computePairCacheChanges");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+	// blocking read (copyArrayFromDevice passes CL_TRUE)
+	copyArrayFromDevice( m_hPairScanChanged,m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+
+#else
+	btGpu3DGridBroadphase::computePairCacheChanges();
+	copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+	
+
+#endif
+	return;
+}
+
+
+
+
+// Declared here but not referenced by the code in this file.
+extern cl_device_type deviceType;
+
+/// Turns the per-object change counts in m_dPairScanChanged into a prefix scan
+/// on the device and stores the total in m_numPrefixSum.
+/// Steps: copy m_numHandles counts device-to-device between m_dPairScanChanged
+/// and m_srcClBuffer (both offsets are 1), run adl::PrefixScan over
+/// m_numHandles+2 elements from m_srcClBuffer into m_dPairScanChanged, then
+/// optionally read the scanned values back.
+/// @param copyToCpu  when true, m_numHandles+2 scanned values are copied into
+///                   m_hPairScanChanged
+void bt3dGridBroadphaseOCL::scanOverlappingPairBuff(bool copyToCpu)
+{
+
+	//Intel/CPU version doesn't handle Adl scan well
+#if 0
+	{
+		copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+		btGpu3DGridBroadphase::scanOverlappingPairBuff();
+		copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+		m_numPrefixSum = m_hPairScanChanged[m_numHandles+1];
+		clFinish(m_cqCommandQue);
+		//memset(m_hPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
+	}
+#else
+	{
+
+	//	copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+	//	btGpu3DGridBroadphase::scanOverlappingPairBuff();
+
+		// Adl view of the raw m_dPairScanChanged handle; nothing is allocated here
+		adl::Buffer<unsigned int> destBuffer;
+		
+		{
+			BT_PROFILE("copy GPU->GPU");
+		
+			destBuffer.m_ptr = (unsigned int*)m_dPairScanChanged;
+			destBuffer.m_device = m_deviceCL;
+			// NOTE(review): m_size is set to a byte count here, while sortHash()
+			// sets m_size to an element count - confirm which unit Adl expects
+			destBuffer.m_size =  sizeof(unsigned int)*(m_numHandles+2);
+			// NOTE(review): presumably copy(dst, src, count, dstOffset, srcOffset),
+			// i.e. the counts go from m_dPairScanChanged into m_srcClBuffer - confirm
+			m_deviceCL->copy(m_srcClBuffer, &destBuffer,m_numHandles,1,1);
+
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+		}
+
+		{
+			BT_PROFILE("PrefixScan");
+			
+			// gData1 was allocated with adl::PrefixScanBase::EXCLUSIVE in the constructor
+			adl::PrefixScan<adl::TYPE_CL>::execute(gData1,*m_srcClBuffer,destBuffer, m_numHandles+2,&m_numPrefixSum);
+			
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+		//if (m_numPrefixSum>0x1000)
+		//	{
+		//		printf("error m_numPrefixSum==%d\n",m_numPrefixSum);
+		//	}
+
+		}
+
+#if 0
+		unsigned int* verifyhPairScanChanged = new unsigned int[m_maxHandles + 2];
+		memset(verifyhPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
+
+		copyArrayFromDevice(verifyhPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
+		clFinish(m_cqCommandQue);
+
+		/*for (int i=0;i<m_numHandles+2;i++)
+		{
+			if (verifyhPairScanChanged[i] != m_hPairScanChanged[i])
+			{
+				printf("hello!\n");
+			}
+		}
+		*/
+
+#endif
+
+
+		if (1)
+		{
+			
+			// optionally read the scanned values back to the host
+			if (copyToCpu)
+			{
+				BT_PROFILE("copy GPU -> CPU");
+				copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+			}
+
+		}
+
+	}
+#endif
+
+	
+}
+
+
+
+/// Launches the kSqueezeOverlappingPairBuff kernel over the m_numHandles small
+/// proxies. The result stays on the device (the read-back of m_dPairsChanged is
+/// commented out). The disabled #else branch calls the base class version.
+void bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, m_numHandles);
+//	btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScanChanged, m_dPairsChanged, m_dAABB, m_numHandles);
+	
+	//copyArrayFromDevice(m_hPairsChanged, m_dPairsChanged, sizeof(unsigned int) * m_numPrefixSum);//m_hPairScanChanged[m_numHandles+1]); //gSum
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::squeezeOverlappingPairBuff();
+#endif
+	return;
+}
+
+
+
+/// Resets the base class pool, then uploads the initial buffer contents again
+/// so that the device buffers match the reset host state.
+void bt3dGridBroadphaseOCL::resetPool(btDispatcher* dispatcher)
+{
+	btGpu3DGridBroadphase::resetPool(dispatcher);
+	prefillBuffers();
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
@@ -0,0 +1,146 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef BT3DGRIDBROADPHASEOCL_H
+#define BT3DGRIDBROADPHASEOCL_H
+
+// Select the OpenCL header: MiniCL when USE_MINICL is defined, otherwise the
+// native header of the platform.
+#ifdef __APPLE__
+#ifdef USE_MINICL
+	#include <MiniCL/cl.h>
+#else
+	// Apple ships the native OpenCL headers inside the OpenCL framework; this
+	// branch used to include the MiniCL header as well, which made USE_MINICL
+	// meaningless on this platform.
+	#include <OpenCL/cl.h>
+#endif
+//CL_PLATFORM_MINI_CL could be defined in build system
+#else
+//#include <GL/glew.h>
+// standard utility and system includes
+#ifdef USE_MINICL
+	#include <MiniCL/cl.h>
+#else
+	#include <CL/cl.h>
+#endif
+// Extra CL/GL include
+//#include <CL/cl_gl.h>
+#endif //__APPLE__
+
+// Forward declarations of the Adl device types, so that this header does not
+// have to include the Adl headers (the class below only stores pointers to them).
+namespace adl
+{
+	struct Device;
+	struct DeviceCL;
+};
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+#include "btGpu3DGridBroadphase.h"
+
+
+// Prints the value of (a) and asserts when (a) != (b); typically used as
+// GRID3DOCL_CHECKERROR(status, CL_SUCCESS). Both arguments are evaluated more
+// than once, so pass plain variables or constants.
+// NOTE(review): the macro expands to a bare if-statement, so using it directly
+// in front of an `else` would capture that else - keep it as a standalone statement.
+#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
+
+// Ids of the broadphase kernels; each id is the index of the kernel in
+// bt3dGridBroadphaseOCL::m_kernels.
+enum
+{
+	GRID3DOCL_KERNEL_CALC_HASH_AABB = 0,		// "kCalcHashAABB"
+	GRID3DOCL_KERNEL_CLEAR_CELL_START,			// "kClearCellStart"
+	GRID3DOCL_KERNEL_FIND_CELL_START,			// "kFindCellStart"
+	GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS,	// "kFindOverlappingPairs"
+	GRID3DOCL_KERNEL_FIND_PAIRS_LARGE,			// "kFindPairsLarge"
+	GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES,		// "kComputePairCacheChanges"
+	GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF,			// "kSqueezeOverlappingPairBuff"
+	GRID3DOCL_KERNEL_TOTAL						// number of kernels, used as the array size
+};
+
+// Bookkeeping record for one kernel, filled in by bt3dGridBroadphaseOCL::initKernel().
+struct bt3dGridOCLKernelInfo
+{
+	int			m_Id;				// GRID3DOCL_KERNEL_* id (same as the array index)
+	cl_kernel	m_kernel;			// handle returned by clCreateKernel
+	char*		m_name;				// kernel function name; the pointer is stored, not a copy
+	int			m_workgroupSize;	// CL_KERNEL_WORK_GROUP_SIZE reported for the device
+};
+
+
+///The bt3dGridBroadphaseOCL uses OpenCL-capable GPU to compute overlapping pairs
+///It overrides the stages of btGpu3DGridBroadphase so that they run as OpenCL
+///kernels (or through Adl sort/scan) on device buffers owned by this class.
+
+class bt3dGridBroadphaseOCL : public btGpu3DGridBroadphase
+{
+protected:
+	// smallest power of two >= m_maxHandles, computed in allocateBuffers()
+	int						m_hashSize;
+	// OpenCL handles supplied by the caller through the constructor
+	cl_context				m_cxMainContext;
+	cl_device_id			m_cdDevice;
+	cl_command_queue		m_cqCommandQue;
+	// program compiled in initCL() and the kernels created from it
+	cl_program				m_cpProgram;
+	bt3dGridOCLKernelInfo	m_kernels[GRID3DOCL_KERNEL_TOTAL];
+	// data buffers
+	cl_mem					m_dBodiesHash;			// two unsigned ints per entry, m_hashSize entries (at least 1 MB)
+	cl_mem					m_dCellStart;			// one unsigned int per grid cell
+	cl_mem					m_dPairBuff; 			// pair lists, m_maxPairsPerBody entries per handle
+	cl_mem					m_dPairBuffStartCurr;	// start/count record of each pair list
+public:
+	cl_mem					m_dAABB;				// two 4-float vectors per small and large proxy
+protected:
+	cl_mem					m_dPairScanChanged;		// per-object change counts, later their prefix scan
+	cl_mem					m_dPairsChanged;		// output of the squeeze kernel
+	cl_mem					m_dPairsContiguous;		// allocated in allocateBuffers(); not used by the kernels bound in initKernels()
+	cl_mem					m_dBpParams;			// inverse cell size and grid size, see setParameters()
+
+	// Adl devices used by the sort and scan helpers
+	adl::Device*			m_deviceHost;
+	adl::DeviceCL*			m_deviceCL;
+	// true when m_deviceCL was created by the constructor and must be deleted by the destructor
+	bool					m_ownsDevice;
+
+
+public:
+	// total written by the prefix scan in scanOverlappingPairBuff()
+	unsigned int			m_numPrefixSum;
+
+	// context, device and queue must be valid (initCL() asserts on them) even
+	// though they default to NULL; deviceCL may be 0, in which case an
+	// adl::DeviceCL is created and owned by this object
+	bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
+							const btVector3& cellSize, 
+							int gridSizeX, int gridSizeY, int gridSizeZ, 
+							int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+							btScalar maxSmallProxySize,
+							int maxSmallProxiesPerCell = 8,
+							cl_context context = NULL,
+							cl_device_id device = NULL,
+							cl_command_queue queue = NULL,
+							adl::DeviceCL* deviceCL = 0
+							);
+	virtual ~bt3dGridBroadphaseOCL();
+
+protected:
+	// setup helpers, called from the constructor
+	void initCL(cl_context context, cl_device_id device, cl_command_queue queue);
+	void initKernels();
+	void allocateBuffers();
+	void prefillBuffers();
+	void initKernel(int kernelId, char* pName);
+	// NOTE(review): allocateArray/freeArray are declared here, but no definition
+	// is visible next to the other member functions - confirm they exist
+	void allocateArray(void** devPtr, unsigned int size);
+	void freeArray(void* devPtr);
+	// sets kernel argument 0 to globalSize, enqueues the kernel and flushes the queue
+	void runKernelWithWorkgroupSize(int kernelId, int globalSize);
+	void setKernelArg(int kernelId, int argNum, int argSize, void* argPtr);
+	// blocking copies; size and both offsets are in bytes
+	void copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs = 0, int hostOffs = 0);
+	void copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs = 0, int devOffs = 0);
+
+// overrides
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
+	virtual void squeezeOverlappingPairBuff();
+	virtual void resetPool(btDispatcher* dispatcher);
+};
+
+#endif //BT3DGRIDBROADPHASEOCL_H
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp
@@ -0,0 +1,626 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///The 3 following lines include the CPU implementation of the kernels, keep them in this order.
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+#include "btGpuUtilsSharedCode.h"
+
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+
+
+
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+
+#include "btGpu3DGridBroadphaseSharedDefs.h"
+
+#include "btGpu3DGridBroadphase.h"
+#include <string.h> //for memset
+
+
+#include <stdio.h>
+
+
+
+static bt3DGridBroadphaseParams s3DGridBroadphaseParams;
+
+
+
+/// Constructs a grid broadphase that creates its own pair cache.
+/// A btHashedOverlappingPairCache is placement-constructed in 16-byte aligned
+/// memory and handed to the btSimpleBroadphase base; every sizing argument is
+/// forwarded unchanged to _initialize().
+/// (A btSortedOverlappingPairCache was used here previously and could be swapped back in.)
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(
+		const btVector3& cellSize,
+		int gridSizeX, int gridSizeY, int gridSizeZ,
+		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+		btScalar maxSmallProxySize,
+		int maxBodiesPerCell)
+	: btSimpleBroadphase(maxSmallProxies,
+			new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache)
+	, m_bInitialized(false)
+	, m_numBodies(0)
+{
+	_initialize(cellSize,
+				gridSizeX, gridSizeY, gridSizeZ,
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxSmallProxySize,
+				maxBodiesPerCell);
+}
+
+
+
+/// Constructs a grid broadphase around a caller-supplied pair cache.
+/// The cache pointer is passed straight to the btSimpleBroadphase base and the
+/// sizing arguments are forwarded unchanged to _initialize().
+/// NOTE(review): _initialize() sets m_ownsPairCache = true, so the base class
+/// presumably releases the supplied cache on destruction - confirm that callers expect this.
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(
+		btOverlappingPairCache* overlappingPairCache,
+		const btVector3& cellSize,
+		int gridSizeX, int gridSizeY, int gridSizeZ,
+		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+		btScalar maxSmallProxySize,
+		int maxBodiesPerCell)
+	: btSimpleBroadphase(maxSmallProxies, overlappingPairCache)
+	, m_bInitialized(false)
+	, m_numBodies(0)
+{
+	_initialize(cellSize,
+				gridSizeX, gridSizeY, gridSizeZ,
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxSmallProxySize,
+				maxBodiesPerCell);
+}
+
+
+
+/// Releases the host-side buffers owned by this class via _finalize().
+btGpu3DGridBroadphase::~btGpu3DGridBroadphase()
+{
+	// The pair cache is not freed here; with m_ownsPairCache set by _initialize()
+	// that is presumably left to the btSimpleBroadphase destructor (verify in the base class).
+	assert(m_bInitialized);
+	_finalize();
+}
+
+/// Returns the largest power of two that does not exceed val,
+/// i.e. 2^n such that 2^(n+1) > val >= 2^n.
+/// Only bits 30..1 are tested; when none of them is set (val is 0 or 1) the result is 1.
+int btGpu3DGridBroadphase::getFloorPowOfTwo(int val)
+{
+	int candidate = 0x40000000;	// start at 2^30, the highest positive power of two in an int
+	int shiftsLeft = 30;
+	// walk down one bit at a time until a set bit is hit or bit 0 is reached
+	while((shiftsLeft > 0) && !(candidate & val))
+	{
+		candidate >>= 1;
+		--shiftsLeft;
+	}
+	return candidate;
+}
+
+
+
+/// Sets up grid parameters and allocates all host-side working buffers.
+///
+/// @param cellSize           size of one grid cell per axis; its reciprocal is stored in m_params
+/// @param gridSizeX/Y/Z      requested grid resolution, rounded down to a power of two via getFloorPowOfTwo()
+/// @param maxSmallProxies    not read here; buffers are sized from m_maxHandles of the base class
+/// @param maxLargeProxies    capacity of the separate "large" proxy pool
+/// @param maxPairsPerBody    number of pair-buffer slots reserved for each small proxy
+/// @param maxSmallProxySize  diameter threshold; half of it is stored as m_maxRadius
+/// @param maxBodiesPerCell   stored in m_params.m_maxBodiesPerCell and m_maxBodiesPerCell
+///
+/// Must be called exactly once per _finalize(); asserts if already initialized.
+void btGpu3DGridBroadphase::_initialize(	const btVector3& cellSize,
+										int gridSizeX, int gridSizeY, int gridSizeZ,
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										btScalar maxSmallProxySize,
+										int maxBodiesPerCell)
+{
+	// set various parameters
+	m_ownsPairCache = true;
+	m_params.m_gridSizeX = getFloorPowOfTwo(gridSizeX);
+	m_params.m_gridSizeY = getFloorPowOfTwo(gridSizeY);
+	m_params.m_gridSizeZ = getFloorPowOfTwo(gridSizeZ);
+	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
+	m_numCells = m_params.m_numCells;
+	m_params.m_invCellSizeX = btScalar(1.f) / cellSize[0];
+	m_params.m_invCellSizeY = btScalar(1.f) / cellSize[1];
+	m_params.m_invCellSizeZ = btScalar(1.f) / cellSize[2];
+	m_maxRadius = maxSmallProxySize * btScalar(0.5f);
+	m_params.m_numBodies = m_numBodies;
+	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
+	// keep the member in sync with the params struct (it was previously left uninitialized)
+	m_maxBodiesPerCell = maxBodiesPerCell;
+
+	m_numLargeHandles = 0;
+	m_maxLargeHandles = maxLargeProxies;
+
+	m_maxPairsPerBody = maxPairsPerBody;
+
+	m_LastLargeHandleIndex = -1;
+
+	assert(!m_bInitialized);
+
+	// allocate host storage
+	// two unsigned ints per small proxy (sortHash() treats them as hash/index pairs)
+	m_hBodiesHash = new unsigned int[m_maxHandles * 2];
+	memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
+
+	m_hCellStart = new unsigned int[m_params.m_numCells];
+	memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
+
+	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
+	// --------------- for now, init with m_maxPairsPerBody for each body
+	// entry [2*i] is the start offset of body i in the pair buffer, entry [2*i+1] is reset to 0
+	m_hPairBuffStartCurr[0] = 0;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int i = 1; i <= m_maxHandles; i++)
+	{
+		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
+		m_hPairBuffStartCurr[i * 2 + 1] = 0;
+	}
+	//----------------
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max
+
+	const unsigned int numPairSlots = m_maxHandles * m_maxPairsPerBody;
+
+	m_hPairBuff = new unsigned int[numPairSlots];
+	memset(m_hPairBuff, 0x00, numPairSlots * sizeof(unsigned int)); // needed?
+
+	m_hPairScanChanged = new unsigned int[m_maxHandles + 2];
+	// clear all (m_maxHandles + 2) entries; the former expression
+	// "sizeof(int)*m_maxHandles + 2" left the tail of the array uninitialized
+	memset(m_hPairScanChanged, 0, sizeof(unsigned int) * (m_maxHandles + 2));
+
+	m_hPairsChanged = new unsigned int[numPairSlots];
+	memset(m_hPairsChanged, 0, sizeof(unsigned int) * numPairSlots);
+
+	m_hAllOverlappingPairs = new MyUint2[numPairSlots];
+	memset(m_hAllOverlappingPairs, 0, sizeof(MyUint2) * numPairSlots);
+
+
+// large proxies
+
+	// allocate handles buffer and put all handles on free list
+	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
+	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
+	m_firstFreeLargeHandle = 0;
+	{
+		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
+		{
+			m_pLargeHandles[i].SetNextFree(i + 1);
+			// unique ids of large proxies start at m_maxHandles+2; isLargeProxy(proxy) relies on this
+			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
+		}
+		// terminate the free list; skipped for an empty pool, where index -1 would be out of bounds
+		if (m_maxLargeHandles > 0)
+		{
+			m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
+		}
+	}
+
+// debug data
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	m_numOverflows = 0;
+
+
+	m_bInitialized = true;
+}
+
+
+
+/// Frees every buffer allocated by _initialize() and marks the object as uninitialized.
+/// Asserts when called on an object that is not initialized.
+void btGpu3DGridBroadphase::_finalize()
+{
+	assert(m_bInitialized);
+
+	// hash / grid buffers
+	delete [] m_hBodiesHash;
+	delete [] m_hCellStart;
+	delete [] m_hAABB;
+
+	// pair buffers
+	delete [] m_hPairBuffStartCurr;
+	delete [] m_hPairBuff;
+	delete [] m_hPairScanChanged;
+	delete [] m_hPairsChanged;
+	delete [] m_hAllOverlappingPairs;
+
+	// the large-proxy pool was obtained from btAlignedAlloc, so release it the same way
+	btAlignedFree(m_pLargeHandlesRawPtr);
+
+	m_bInitialized = false;
+}
+
+
+
+/// Runs the complete broadphase update.
+/// The base-class update is executed first; afterwards the grid pipeline stages
+/// run in a fixed order, each inside its own profiling scope. Without small
+/// proxies only the large/large pass is performed.
+void btGpu3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
+{
+	btSimpleBroadphase::calculateOverlappingPairs(dispatcher);
+
+	if(m_numHandles <= 0)
+	{
+		// nothing for the grid to do: handle large/large pairs and leave
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+		return;
+	}
+
+	// update constants
+	{ BT_PROFILE("setParameters");				setParameters(&m_params); }
+	// prepare AABB array
+	{ BT_PROFILE("prepareAABB");				prepareAABB(); }
+	// calculate hash
+	{ BT_PROFILE("calcHashAABB");				calcHashAABB(); }
+	// sort bodies based on hash
+	{ BT_PROFILE("sortHash");					sortHash(); }
+	// find start of each cell
+	{ BT_PROFILE("findCellStart");				findCellStart(); }
+	// small/small pairs
+	{ BT_PROFILE("findOverlappingPairs");		findOverlappingPairs(); }
+	// small/large pairs
+	{ BT_PROFILE("findPairsLarge");				findPairsLarge(); }
+	// work out what changed, then add pairs to CPU cache
+	{ BT_PROFILE("computePairCacheChanges");	computePairCacheChanges(); }
+	{ BT_PROFILE("scanOverlappingPairBuff");	scanOverlappingPairBuff(); }
+	{ BT_PROFILE("squeezeOverlappingPairBuff");	squeezeOverlappingPairBuff(); }
+	{ BT_PROFILE("addPairsToCache");			addPairsToCache(dispatcher); }
+	// find and add large/large pairs to CPU cache
+	{ BT_PROFILE("addLarge2LargePairsToCache");	addLarge2LargePairsToCache(dispatcher); }
+}
+
+
+
+/// Applies the per-body change lists to the CPU pair cache.
+/// For body i the entries m_hPairsChanged[scan[i+1] .. scan[i+2]) are read, where
+/// scan is m_hPairScanChanged. Each entry holds the other proxy's index plus flag
+/// bits: entries with BT_3DGRID_PAIR_NEW_FLG are added to the cache, all others are
+/// removed. Indices >= m_maxHandles address the large-proxy pool.
+/// Also refreshes the m_numPairsAdded / m_numPairsRemoved debug counters.
+/// NOTE(review): squeezeOverlappingPairBuff() in this file writes into
+/// m_hAllOverlappingPairs, not m_hPairsChanged - confirm which buffer is meant to feed this loop.
+void btGpu3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
+{
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	const unsigned int numSmallSlots = (unsigned int)m_maxHandles;
+	for(int body = 0; body < m_numHandles; body++)
+	{
+		const unsigned int first = m_hPairScanChanged[body+1];
+		const unsigned int count = m_hPairScanChanged[body+2] - first;
+		if(count == 0)
+		{
+			continue;
+		}
+		const unsigned int* changes = m_hPairsChanged + first;
+		// the min-corner entry of the AABB array carries the handle index in its uw field
+		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[m_hAABB[body * 2].uw];
+		for(unsigned int k = 0; k < count; k++)
+		{
+			const unsigned int entry = changes[k];
+			unsigned int otherIndex = entry & (~BT_3DGRID_PAIR_ANY_FLG);
+			btSimpleBroadphaseProxy* proxy1;
+			if(otherIndex >= numSmallSlots)
+			{
+				// large proxy: rebase the index into the large-handle pool
+				otherIndex -= numSmallSlots;
+				btAssert(otherIndex < (unsigned int)m_maxLargeHandles);
+				proxy1 = &m_pLargeHandles[otherIndex];
+			}
+			else
+			{
+				proxy1 = &m_pHandles[otherIndex];
+			}
+			if(entry & BT_3DGRID_PAIR_NEW_FLG)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+				m_numPairsAdded++;
+			}
+			else
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+				m_numPairsRemoved++;
+			}
+		}
+	}
+}
+
+
+
+/// Creates a broadphase proxy for the given AABB.
+/// Proxies classified as small by isLargeProxy(aabbMin, aabbMax) are created by the
+/// btSimpleBroadphase base; large ones are constructed in place inside the separate
+/// large-handle pool.
+/// @return the new proxy, or 0 when the large-handle pool is exhausted
+btBroadphaseProxy* btGpu3DGridBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
+{
+	if(!isLargeProxy(aabbMin, aabbMax))
+	{
+		// small proxy: the base class manages it
+		return btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
+	}
+	if (m_numLargeHandles >= m_maxLargeHandles)
+	{
+		///you have to increase the cell size, so 'large' proxies become 'small' proxies (fitting a cell)
+		btAssert(0);
+		return 0; //should never happen, but don't let the game crash ;-)
+	}
+	btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
+	const int slot = allocLargeHandle();
+	// construct the proxy directly in its pool slot
+	return new (&m_pLargeHandles[slot])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
+}
+
+
+
+/// Destroys a proxy created by createProxy().
+/// Small proxies are handed to the base class; large ones are returned to the
+/// large-handle free list and every cached pair that contains them is removed.
+void btGpu3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
+{
+	if(!isLargeProxy(proxy))
+	{
+		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
+		return;
+	}
+	freeLargeHandle(static_cast<btSimpleBroadphaseProxy*>(proxy));
+	m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
+}
+
+
+
+/// Restores m_hPairBuffStartCurr to the layout set up by _initialize():
+/// body i starts at offset i * m_maxPairsPerBody and its second entry is 0.
+/// The dispatcher argument is not used.
+void btGpu3DGridBroadphase::resetPool(btDispatcher* dispatcher)
+{
+	unsigned int startOffset = 0;
+	m_hPairBuffStartCurr[0] = startOffset;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int body = 1; body <= m_maxHandles; body++)
+	{
+		startOffset += m_maxPairsPerBody;
+		unsigned int* entry = m_hPairBuffStartCurr + body * 2;
+		entry[0] = startOffset;
+		entry[1] = 0;
+	}
+}
+
+
+
+/// Classifies an AABB as "large" when the radius of its bounding sphere
+/// (half the box diagonal, so that rotation is covered) exceeds m_maxRadius.
+bool btGpu3DGridBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
+{
+	const btVector3 extent = aabbMax - aabbMin;
+	const btScalar boundingRadius = extent.length() * btScalar(0.5f);
+	if(boundingRadius > m_maxRadius)
+	{
+		return true;
+	}
+	return false;
+}
+
+
+
+/// Tells whether an existing proxy lives in the large-handle pool.
+/// _initialize() gives large handles unique ids starting at m_maxHandles+2,
+/// so the test is a simple id comparison.
+bool btGpu3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
+{
+	const int firstLargeUid = m_maxHandles+2;
+	return (proxy->getUid() >= firstLargeUid);
+}
+
+
+
+/// Brute-force test of every large proxy against every other large proxy.
+/// Overlapping pairs missing from the cache are added; cached pairs that no
+/// longer overlap are removed.
+/// NOTE(review): every slot up to m_LastLargeHandleIndex is visited, including
+/// slots released by freeLargeHandle() - confirm that stale slots cannot produce pairs.
+void btGpu3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
+{
+	if (m_numLargeHandles <= 0)
+	{
+		return;
+	}
+	int lastVisited = -1;
+	for(int a = 0; a <= m_LastLargeHandleIndex; a++)
+	{
+		btSimpleBroadphaseProxy* proxyA = &m_pLargeHandles[a];
+		lastVisited = a;
+		for(int b = a + 1; b <= m_LastLargeHandleIndex; b++)
+		{
+			btSimpleBroadphaseProxy* proxyB = &m_pLargeHandles[b];
+			btAssert(proxyA != proxyB);
+			btSimpleBroadphaseProxy* simpleA = getSimpleProxyFromProxy(proxyA);
+			btSimpleBroadphaseProxy* simpleB = getSimpleProxyFromProxy(proxyB);
+			const bool overlapping = aabbOverlap(simpleA,simpleB);
+			// one cache lookup per pair, exactly as in either branch before
+			const bool cached = (m_pairCache->findPair(proxyA,proxyB) != 0);
+			if(overlapping && !cached)
+			{
+				m_pairCache->addOverlappingPair(proxyA,proxyB);
+			}
+			else if(!overlapping && cached)
+			{
+				m_pairCache->removeOverlappingPair(proxyA,proxyB,dispatcher);
+			}
+		}
+	}
+	m_LastLargeHandleIndex = lastVisited;
+}
+
+
+
+/// Ray query: the base class handles the small proxies, then every slot of the
+/// large-handle pool up to m_LastLargeHandleIndex is passed to the callback
+/// without any ray/AABB culling.
+void btGpu3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
+{
+	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
+	int slot = 0;
+	while(slot <= m_LastLargeHandleIndex)
+	{
+		rayCallback.process(&m_pLargeHandles[slot]);
+		++slot;
+	}
+}
+
+
+
+//
+// overrides for CPU version
+//
+
+
+
+/// Copies the AABBs of all proxies into the flat m_hAABB array.
+/// Each proxy produces two consecutive entries (min corner, then max corner).
+/// Small proxies come first: the min entry's uw holds the handle slot, the max
+/// entry's uw a running counter. Large proxies follow, with both uw values
+/// offset by m_maxHandles.
+void btGpu3DGridBroadphase::prepareAABB()
+{
+	BT_PROFILE("prepareAABB");
+	bt3DGrid3F1U* out = m_hAABB;
+
+	// ---- small proxies
+	unsigned int smallCount = 0;
+	int lastVisited = -1;
+	for(int slot = 0; slot <= m_LastHandleIndex; slot++, smallCount++, out += 2)
+	{
+		const btSimpleBroadphaseProxy& proxy = m_pHandles[slot];
+		lastVisited = slot;
+		out[0].fx = proxy.m_aabbMin.getX();
+		out[0].fy = proxy.m_aabbMin.getY();
+		out[0].fz = proxy.m_aabbMin.getZ();
+		out[0].uw = slot;
+		out[1].fx = proxy.m_aabbMax.getX();
+		out[1].fy = proxy.m_aabbMax.getY();
+		out[1].fz = proxy.m_aabbMax.getZ();
+		out[1].uw = smallCount;
+	}
+	m_LastHandleIndex = lastVisited;
+
+	// ---- large proxies, appended directly after the small ones
+	unsigned int largeCount = 0;
+	lastVisited = -1;
+	for(int slot = 0; slot <= m_LastLargeHandleIndex; slot++, largeCount++, out += 2)
+	{
+		const btSimpleBroadphaseProxy& proxy = m_pLargeHandles[slot];
+		lastVisited = slot;
+		out[0].fx = proxy.m_aabbMin.getX();
+		out[0].fy = proxy.m_aabbMin.getY();
+		out[0].fz = proxy.m_aabbMin.getZ();
+		out[0].uw = slot + m_maxHandles;
+		out[1].fx = proxy.m_aabbMax.getX();
+		out[1].fy = proxy.m_aabbMax.getY();
+		out[1].fz = proxy.m_aabbMax.getZ();
+		out[1].uw = largeCount + m_maxHandles;
+	}
+	m_LastLargeHandleIndex = lastVisited;
+
+	// paranoid checks
+	btAssert(smallCount == m_numHandles);
+	btAssert(largeCount == m_numLargeHandles);
+}
+
+
+
+/// Publishes the grid parameters for the CPU kernels by copying *hostParams
+/// into the file-static s3DGridBroadphaseParams.
+void btGpu3DGridBroadphase::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	const bt3DGridBroadphaseParams& incoming = *hostParams;
+	s3DGridBroadphaseParams = incoming;
+}
+
+
+
+/// CPU stage: runs btGpu_calcHashAABB over the m_numHandles small proxies,
+/// reading m_hAABB and writing m_hBodiesHash.
+void btGpu3DGridBroadphase::calcHashAABB()
+{
+	BT_PROFILE("bt3DGrid_calcHashAABB");
+	btGpu_calcHashAABB(	m_hAABB,
+						m_hBodiesHash,
+						m_numHandles);
+}
+
+
+
+/// CPU stage: sorts the (hash, index) pairs stored in m_hBodiesHash by hash.
+/// The in-place quicksort below places larger hash values first (descending order).
+/// Nothing is done for fewer than two handles; the previous code called the sort
+/// with hi == -1 for an empty set, which read pData[-1].
+void btGpu3DGridBroadphase::sortHash()
+{
+	// View of two consecutive unsigned ints of m_hBodiesHash as one sortable record.
+	class bt3DGridHashKey
+	{
+	public:
+	   unsigned int hash;
+	   unsigned int index;
+	   // Recursive quicksort of pData[lo..hi] (inclusive), pivot taken from the middle element.
+	   static void quickSort(bt3DGridHashKey* pData, int lo, int hi)
+	   {
+			int i=lo, j=hi;
+			bt3DGridHashKey x = pData[(lo+hi)/2];
+			do
+			{
+				// skip entries already on the correct side of the pivot
+				while(pData[i].hash > x.hash) i++;
+				while(x.hash > pData[j].hash) j--;
+				if(i <= j)
+				{
+					bt3DGridHashKey t = pData[i];
+					pData[i] = pData[j];
+					pData[j] = t;
+					i++; j--;
+				}
+			} while(i <= j);
+			if(lo < j) quickSort(pData, lo, j);
+			if(i < hi) quickSort(pData, i, hi);
+	   }
+	};
+	BT_PROFILE("bt3DGrid_sortHash");
+	if(m_numHandles < 2)
+	{
+		// zero or one entry is already sorted
+		return;
+	}
+	bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash;
+	bt3DGridHashKey::quickSort(pHash, 0, m_numHandles - 1);
+}
+
+
+
+/// CPU stage: runs btGpu_findCellStart on the sorted m_hBodiesHash,
+/// filling m_hCellStart for the m_params.m_numCells grid cells.
+void btGpu3DGridBroadphase::findCellStart()
+{
+	BT_PROFILE("bt3DGrid_findCellStart");
+	btGpu_findCellStart(	m_hBodiesHash,
+							m_hCellStart,
+							m_numHandles,
+							m_params.m_numCells);
+}
+
+
+
+/// CPU stage (small/small): runs btGpu_findOverlappingPairs for the m_numHandles
+/// small proxies using the AABB, hash and cell-start arrays together with the
+/// pair buffer and its start/current table.
+void btGpu3DGridBroadphase::findOverlappingPairs()
+{
+	BT_PROFILE("bt3DGrid_findOverlappingPairs");
+	btGpu_findOverlappingPairs(	m_hAABB,
+								m_hBodiesHash,
+								m_hCellStart,
+								m_hPairBuff,
+								m_hPairBuffStartCurr,
+								m_numHandles);
+}
+
+
+
+/// CPU stage (small/large): runs btGpu_findPairsLarge with the same buffers as
+/// the small/small stage plus the number of large proxies.
+void btGpu3DGridBroadphase::findPairsLarge()
+{
+	BT_PROFILE("bt3DGrid_findPairsLarge");
+	btGpu_findPairsLarge(	m_hAABB,
+							m_hBodiesHash,
+							m_hCellStart,
+							m_hPairBuff,
+							m_hPairBuffStartCurr,
+							m_numHandles,
+							m_numLargeHandles);
+}
+
+
+
+/// CPU stage: runs btGpu_computePairCacheChanges over the pair buffer,
+/// passing m_hPairScanChanged as its per-body output array.
+void btGpu3DGridBroadphase::computePairCacheChanges()
+{
+	BT_PROFILE("bt3DGrid_computePairCacheChanges");
+	btGpu_computePairCacheChanges(	m_hPairBuff,
+									m_hPairBuffStartCurr,
+									m_hPairScanChanged,
+									m_hAABB,
+									m_numHandles);
+}
+
+
+/// CPU stage: converts m_hPairScanChanged in place into an exclusive prefix sum
+/// over its first m_numHandles+2 entries (entry 0 is forced to 0 beforehand).
+/// Afterwards entry k holds the sum of all original entries before k.
+/// The copyToCpu argument is not used by this CPU implementation.
+void btGpu3DGridBroadphase::scanOverlappingPairBuff(bool copyToCpu)
+{
+	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
+	m_hPairScanChanged[0]=0;
+	unsigned int runningTotal = 0;
+	unsigned int* entry = m_hPairScanChanged;
+	for(int remaining = m_numHandles + 2; remaining > 0; --remaining, ++entry)
+	{
+		const unsigned int count = *entry;
+		*entry = runningTotal;
+		runningTotal += count;
+	}
+}
+
+
+
+/// CPU stage: runs btGpu_squeezeOverlappingPairBuff with m_hAllOverlappingPairs
+/// (viewed as unsigned int*) as the output buffer.
+/// An earlier variant that wrote into m_hPairsChanged is kept below for reference.
+void btGpu3DGridBroadphase::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
+	//btGpu_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScanChanged, m_hPairsChanged, m_hAABB, m_numHandles);
+	unsigned int* squeezedOut = (unsigned int*)m_hAllOverlappingPairs;
+	btGpu_squeezeOverlappingPairBuff(	m_hPairBuff,
+										m_hPairBuffStartCurr,
+										m_hPairScanChanged,
+										squeezedOut,
+										m_hAABB,
+										m_numHandles);
+}
+
+
+
+#include "btGpu3DGridBroadphaseSharedCode.h"
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.h
@@ -0,0 +1,154 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASE_H
+#define BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+/// Pair of two integers; btGpu3DGridBroadphase uses an array of these
+/// (m_hAllOverlappingPairs) as the buffer returned by getOverlap().
+/// Note that, despite the name, both members are signed ints.
+struct MyUint2
+{
+	int x;
+	int y;
+};
+
+//----------------------------------------------------------------------------------------
+
+///The btGpu3DGridBroadphase uses GPU-style code compiled for CPU to compute overlapping pairs.
+///Small proxies are managed by the btSimpleBroadphase base class; proxies that
+///isLargeProxy() classifies as large are kept in a separate fixed-size pool
+///(m_pLargeHandles) with its own free list.
+
+class btGpu3DGridBroadphase : public btSimpleBroadphase
+{
+protected:
+	bool			m_bInitialized;		// true between _initialize() and _finalize()
+	unsigned int	m_numBodies;
+	unsigned int	m_numCells;
+	unsigned int	m_maxPairsPerBody;	// pair-buffer slots reserved per small proxy
+	unsigned int	m_maxBodiesPerCell;
+	bt3DGridBroadphaseParams m_params;
+	btScalar		m_maxRadius;		// bounding radius above which a proxy counts as large
+	// CPU data
+	unsigned int*	m_hBodiesHash;		// two unsigned ints per small proxy (hash, index)
+	unsigned int*	m_hCellStart;		// one entry per grid cell
+	unsigned int*	m_hPairBuffStartCurr;	// two entries per small proxy, plus two
+	bt3DGrid3F1U*	m_hAABB;			// min/max entry per proxy, small ones first
+	unsigned int*	m_hPairBuff;
+	unsigned int*	m_hPairScanChanged;	// m_maxHandles + 2 entries
+	unsigned int*	m_hPairsChanged;
+	MyUint2*		m_hAllOverlappingPairs;
+// large proxies
+	int		m_numLargeHandles;
+	int		m_maxLargeHandles;
+	int		m_LastLargeHandleIndex;
+	btSimpleBroadphaseProxy* m_pLargeHandles;
+	void* m_pLargeHandlesRawPtr;
+	int		m_firstFreeLargeHandle;
+	/// Pops a slot from the large-handle free list and returns its index.
+	/// The caller must ensure that the pool is not exhausted.
+	int allocLargeHandle()
+	{
+		btAssert(m_numLargeHandles < m_maxLargeHandles);
+		int freeLargeHandle = m_firstFreeLargeHandle;
+		m_firstFreeLargeHandle = m_pLargeHandles[freeLargeHandle].GetNextFree();
+		m_numLargeHandles++;
+		if(freeLargeHandle > m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex = freeLargeHandle;
+		}
+		return freeLargeHandle;
+	}
+	/// Pushes the slot occupied by proxy back onto the large-handle free list
+	/// and clears its client object.
+	void freeLargeHandle(btSimpleBroadphaseProxy* proxy)
+	{
+		int handle = int(proxy - m_pLargeHandles);
+		// the slot must lie inside the large-handle pool (the bound was previously m_maxHandles)
+		btAssert((handle >= 0) && (handle < m_maxLargeHandles));
+		if(handle == m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex--;
+		}
+		proxy->SetNextFree(m_firstFreeLargeHandle);
+		m_firstFreeLargeHandle = handle;
+		proxy->m_clientObject = 0;
+		m_numLargeHandles--;
+	}
+	bool isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax);
+	bool isLargeProxy(btBroadphaseProxy* proxy);
+// debug
+	unsigned int	m_numPairsAdded;
+	unsigned int	m_numPairsRemoved;
+	unsigned int	m_numOverflows;
+//
+public:
+	/// Returns the last entry of the scanned m_hPairScanChanged range,
+	/// i.e. the total produced by scanOverlappingPairBuff().
+	virtual int getNumOverlap()
+	{
+		return m_hPairScanChanged[m_numHandles+1];
+	}
+	/// Returns the buffer that squeezeOverlappingPairBuff() writes to.
+	virtual MyUint2* getOverlap()
+	{
+		return m_hAllOverlappingPairs;
+	}
+	// NOTE : for better results gridSizeX, gridSizeY and gridSizeZ should be powers of 2
+	// (other values are rounded down to a power of two by getFloorPowOfTwo)
+	btGpu3DGridBroadphase(const btVector3& cellSize,
+					   int gridSizeX, int gridSizeY, int gridSizeZ,
+					   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+					   int maxBodiesPerCell = 8);
+	btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
+						const btVector3& cellSize,
+						int gridSizeX, int gridSizeY, int gridSizeZ,
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+						int maxBodiesPerCell = 8);
+	virtual ~btGpu3DGridBroadphase();
+	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
+
+	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
+	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
+	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
+	virtual void	resetPool(btDispatcher* dispatcher);
+
+	static int		getFloorPowOfTwo(int val); // returns 2^n : 2^(n+1) > val >= 2^n
+
+protected:
+	void _initialize(	const btVector3& cellSize,
+						int gridSizeX, int gridSizeY, int gridSizeZ,
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+						int maxBodiesPerCell);
+	void _finalize();
+	void addPairsToCache(btDispatcher* dispatcher);
+	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
+
+// overrides for CPU version
+// (pipeline stages, called in this order by calculateOverlappingPairs)
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
+	virtual void squeezeOverlappingPairBuff();
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif //BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedCode.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedCode.h
@@ -0,0 +1,428 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// Convert a position into integer cell coordinates of the uniform grid.
+// Each component is multiplied by the inverse cell size, floored, and then
+// masked with (gridSize - 1), so cells outside the grid wrap around.
+// NOTE(review): the mask only behaves like a modulo when every grid dimension
+// is a power of two - confirm against the host-side grid setup.
+BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
+{
+    const int cellX = (int)floor(p.x * BT_GPU_params.m_invCellSizeX);
+    const int cellY = (int)floor(p.y * BT_GPU_params.m_invCellSizeY);
+    const int cellZ = (int)floor(p.z * BT_GPU_params.m_invCellSizeZ);
+
+    int3 cell;
+    cell.x = cellX & (BT_GPU_params.m_gridSizeX - 1);
+    cell.y = cellY & (BT_GPU_params.m_gridSizeY - 1);
+    cell.z = cellZ & (BT_GPU_params.m_gridSizeZ - 1);
+    return cell;
+} // bt3DGrid_calcGridPos()
+
+//----------------------------------------------------------------------------------------
+
+// Flatten integer cell coordinates into a linear cell index:
+//   (z * gridSizeY) * gridSizeX + y * gridSizeX + x
+// The coordinates are masked with (gridSize - 1) first, so out-of-range
+// cells wrap around; they are not clamped to the grid edges.
+BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
+{
+	const int cellX = gridPos.x & (BT_GPU_params.m_gridSizeX - 1);
+	const int cellY = gridPos.y & (BT_GPU_params.m_gridSizeY - 1);
+	const int cellZ = gridPos.z & (BT_GPU_params.m_gridSizeZ - 1);
+	return BT_GPU___mul24(BT_GPU___mul24(cellZ, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX)
+		 + BT_GPU___mul24(cellY, BT_GPU_params.m_gridSizeX)
+		 + cellX;
+} // bt3DGrid_calcGridHash()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel, one thread per body: take the centre of the body's AABB, find the
+// grid cell that contains it and store (cell hash, body index) in pHash.
+// pAABB holds two entries per body: the minimum at [2*i], the maximum at [2*i + 1].
+BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
+{
+    int tid = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(tid < (int)numBodies)
+	{
+		const bt3DGrid3F1U lo = pAABB[tid*2];
+		const bt3DGrid3F1U hi = pAABB[tid*2 + 1];
+		// centre of the box; the w component is left unset, as only x, y, z are used
+		float4 centre;
+		centre.x = (lo.fx + hi.fx) * 0.5f;
+		centre.y = (lo.fy + hi.fy) * 0.5f;
+		centre.z = (lo.fz + hi.fz) * 0.5f;
+		// cell coordinates -> linear cell index
+		const int3 cell = bt3DGrid_calcGridPos(centre);
+		const uint cellHash = bt3DGrid_calcGridHash(cell);
+		// remember which body produced this hash
+		pHash[tid] = BT_GPU_make_uint2(cellHash, tid);
+	}
+} // calcHashAABBD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel, one thread per entry of pHash: for every hash value that starts a new
+// run in pHash, record the index of the run's first entry in cellStart[hash].
+// An entry starts a run when it is entry 0 or its hash differs from the hash of
+// the entry before it.
+// NOTE(review): this only yields per-cell start indices if pHash is already
+// sorted by hash (presumably by sortHash()) - confirm against the caller.
+BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		// NOTE(review): threads that leave here never reach the barrier below -
+		// confirm this is acceptable for a partially filled last block.
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	// Load hash data into shared memory so that we can look 
+	// at neighboring body's hash value without loading
+	// two hash values per thread
+	// Layout: slot 0 holds the hash of the entry preceding this block, slot t+1
+	// holds the hash loaded by thread t. 257 slots therefore cover blocks of up
+	// to 256 threads, which is the block size the findCellStart launcher requests.
+	BT_GPU___shared__ uint sharedHash[257];
+	sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
+	if((index > 0) && (BT_GPU_threadIdx.x == 0))
+	{
+		// first thread in block must load neighbor body hash
+		// NOTE(review): 'volatile' presumably keeps the compiler from eliding or
+		// reordering this load - confirm it is still needed.
+		volatile uint2 prevData = pHash[index-1];
+		sharedHash[0] = prevData.x;
+	}
+	// make every thread's hash visible before neighbours are compared
+	BT_GPU___syncthreads();
+	// sharedHash[threadIdx.x] is the hash of the previous entry (index - 1)
+	if((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x]))
+	{
+		cellStart[sortedData.x] = index;
+	}
+} // findCellStartD()
+
+//----------------------------------------------------------------------------------------
+
+// Returns 1 when the two axis-aligned boxes [min0,max0] and [min1,max1] overlap
+// or touch on all three axes, 0 otherwise. Only fx, fy, fz are compared.
+BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
+{
+	const bool overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
+	const bool overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
+	const bool overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
+	return overlapX && overlapY && overlapZ;
+} // cudaTestAABBOverlap()
+ 
+//----------------------------------------------------------------------------------------
+
+// Test the body stored at pHash[index] against the bodies hashed into the grid
+// cell 'gridPos' and record overlaps in that body's pair list.
+//
+// Pair list layout, as used below:
+//   pPairBuffStartCurr[h].x - offset of handle h's list inside pPairBuff
+//   pPairBuffStartCurr[h].y - number of entries currently in that list
+//   the list ends where the list of handle h+1 starts
+// Each entry is the other body's handle index, optionally combined with
+// BT_3DGRID_PAIR_FOUND_FLG (entry was already present and overlaps again) or
+// BT_3DGRID_PAIR_NEW_FLG (entry was appended by this call).
+// NOTE(review): assumes pHash is sorted by cell hash and pCellStart was filled
+// by findCellStartD with 0xffffffff marking empty cells - confirm with caller.
+BT_GPU___device__ void findPairsInCell(	int3	gridPos,
+										uint    index,
+										uint2*  pHash,
+										uint*   pCellStart,
+										bt3DGrid3F1U* pAABB, 
+										uint*   pPairBuff,
+										uint2*	pPairBuffStartCurr,
+										uint	numBodies)
+{
+    uint gridHash = bt3DGrid_calcGridHash(gridPos);
+    // get start of bucket for this cell
+    uint bucketStart = pCellStart[gridHash];
+    if (bucketStart == 0xffffffff)
+	{
+        return;   // cell empty
+	}
+	// load this body's AABB; its handle index is stored in the 'uw' field of the min entry
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+    bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2); 
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	// capacity limit: one less than the distance to the next handle's list
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+	// scan at most m_maxBodiesPerCell entries of the bucket, never past the end of pHash
+	uint bucketEnd = bucketStart + BT_GPU_params.m_maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
+	for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        uint2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		uint unsorted_indx2 = cellData.y;
+        if (unsorted_indx2 < unsorted_indx) // skips self; each pair is handled by the body with the larger index
+        {   
+			bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2);
+			bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2 + 1);
+			if(cudaTestAABBOverlap(min0, max0, min1, max1))
+			{
+				uint handleIndex2 = min1.uw;
+				uint k;
+				// look for an existing entry for this handle (flags ignored)
+				for(k = 0; k < curr; k++)
+				{
+					uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
+					if(old_pair == handleIndex2)
+					{
+						pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+						break;
+					}
+				}
+				// k == curr means no existing entry matched: append a new one
+				if(k == curr)
+				{
+					if(curr >= curr_max) 
+					{ // not a good solution, but let's avoid crash
+						// list is full: stop scanning this cell, the pair is dropped
+						break;
+					}
+					pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
+					curr++;
+				}
+			}
+		}
+	}
+	// publish the updated entry count for this handle
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
+    return;
+} // findPairsInCell()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel, one thread per entry of pHash: locate the grid cell containing the
+// centre of the body's AABB and run findPairsInCell() on that cell and its 26
+// neighbours (offsets -1..1 on each axis, x varying fastest).
+BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U*	pAABB, uint2* pHash, uint* pCellStart, 
+												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index < (int)numBodies)
+	{
+		// pHash[index].y is the body's position in the AABB array
+		const uint bodyIdx = pHash[index].y;
+		const bt3DGrid3F1U lo = BT_GPU_FETCH(pAABB, bodyIdx*2);
+		const bt3DGrid3F1U hi = BT_GPU_FETCH(pAABB, bodyIdx*2 + 1);
+		float4 centre;
+		centre.x = (lo.fx + hi.fx) * 0.5f;
+		centre.y = (lo.fy + hi.fy) * 0.5f;
+		centre.z = (lo.fz + hi.fz) * 0.5f;
+		const int3 cell = bt3DGrid_calcGridPos(centre);
+		// visit the 3x3x3 block of cells centred on 'cell'
+		for(int dz = -1; dz <= 1; dz++)
+		{
+			for(int dy = -1; dy <= 1; dy++)
+			{
+				for(int dx = -1; dx <= 1; dx++)
+				{
+					findPairsInCell(cell + BT_GPU_make_int3(dx, dy, dz), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
+				}
+			}
+		}
+	}
+} // findOverlappingPairsD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel, one thread per entry of pHash: test the body's AABB against each of
+// the numLarge AABBs stored behind the first numBodies AABBs in pAABB and
+// record overlaps in the body's pair list.
+//
+// Pair list layout, as used below:
+//   pPairBuffStartCurr[h].x - offset of handle h's list inside pPairBuff
+//   pPairBuffStartCurr[h].y - number of entries currently in that list
+//   the list ends where the list of handle h+1 starts
+// An existing entry that overlaps again gets BT_3DGRID_PAIR_FOUND_FLG; a newly
+// appended entry carries BT_3DGRID_PAIR_NEW_FLG.
+// pCellStart is not used by this kernel.
+BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
+										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	// the handle index is stored in the 'uw' field of the min entry
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	// capacity limit: one less than the distance to the next handle's list
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		// large bodies follow the numBodies small ones in the AABB array
+		uint indx2 = numBodies + i;
+		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
+		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
+		if(cudaTestAABBOverlap(min0, max0, min1, max1))
+		{
+			uint k;
+			uint handleIndex2 =  min1.uw;
+			// look for an existing entry for this handle (flags ignored)
+			for(k = 0; k < curr; k++)
+			{
+				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+					break;
+				}
+			}
+			// k == curr means no existing entry matched: append a new one
+			if(k == curr)
+			{
+				// Check the capacity BEFORE storing, as findPairsInCell() does. Storing
+				// first wrote an entry that was never counted when the list was full,
+				// and would write past the list if curr ever exceeded curr_max.
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
+				curr++;
+			}
+		}
+    }
+	// publish the updated entry count for this handle
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
+    return;
+} // findPairsLargeD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel, one thread per body: count the entries of the body's pair list that
+// carry BT_3DGRID_PAIR_FOUND_FLG or BT_3DGRID_PAIR_NEW_FLG and store the count
+// in pPairScan[index+1]. The handle index is read from the 'uw' field of the
+// body's min AABB entry.
+BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, 
+												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index < (int)numBodies)
+	{
+		const uint handleIndex = pAABB[index * 2].uw;
+		// x = offset of the list in pPairBuff, y = number of entries in it
+		const uint2 listRange = pPairBuffStartCurr[handleIndex];
+		const uint listStart = listRange.x;
+		const uint listCount = listRange.y;
+		uint flagged = 0;
+		for(uint k = 0; k < listCount; k++)
+		{
+			if(pPairBuff[listStart + k] & BT_3DGRID_PAIR_ANY_FLG)
+			{
+				flagged++;
+			}
+		}
+		pPairScan[index+1] = flagged;
+	}
+} // computePairCacheChangesD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel, one thread per body: walk the body's pair list and
+//   1) write (handleIndex, other handle) to pPairOut for every entry that has
+//      BT_3DGRID_PAIR_FOUND_FLG or BT_3DGRID_PAIR_NEW_FLG set, and
+//   2) compact the list in place, keeping only those flagged entries with
+//      their flags cleared, then store the new entry count.
+// Entries without either flag are dropped from the list.
+// NOTE(review): output starts at pPairOut + pPairScan[index+1], the same slot
+// computePairCacheChangesD fills with a per-body count. This presumably relies
+// on a scan having turned the counts into start offsets in between - confirm
+// which scan variant the caller runs.
+BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
+												   uint2* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	// the handle index is stored in the 'uw' field of the min AABB entry
+	bt3DGrid3F1U bbMin = pAABB[index * 2];
+	uint handleIndex = bbMin.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	// pInp: read cursor over the list, pOut: write cursor into pPairOut,
+	// pOut2: write cursor for the in-place compaction (never ahead of pInp)
+	uint* pInp = pPairBuff + start;
+	uint2* pOut = pPairOut + pPairScan[index+1];
+	uint* pOut2 = pInp;
+	uint num = 0; 
+	for(uint k = 0; k < curr; k++, pInp++)
+	{
+		if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
+		//if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
+		{
+			pOut->x = handleIndex;
+			pOut->y = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
+
+			pOut++;
+		}
+		// same condition as above: keep the entry, flags cleared
+		if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
+		{
+			*pOut2 = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
+			pOut2++;
+			num++;
+		}
+	}
+	// store the compacted entry count
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, num);
+} // squeezeOverlappingPairBuffD()
+
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               E N D   O F    K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for calcHashAABBD: one thread per body, at most 256
+// threads per block. 'hash' is handed to the kernel as an array of uint2.
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
+{
+    int blockCount, threadsPerBlock;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+    // run the kernel, then report any launch error
+    BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
+    BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
+} // calcHashAABB()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for findCellStartD. All numCells entries of cellStart are
+// first filled with 0xff bytes (0xffffffff per entry, the "empty cell" value),
+// then the kernel runs with one thread per body, at most 256 threads per block.
+void BT_GPU_PREF(findCellStart(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells))
+{
+    int blockCount, threadsPerBlock;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+    // mark every cell as empty before the kernel records the occupied ones
+    BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
+    BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
+} // findCellStart()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for findOverlappingPairsD: one thread per body, at most
+// 64 threads per block. When B_CUDA_USE_TEX is enabled the AABB array is bound
+// to the pAABBTex texture for the duration of the kernel.
+void BT_GPU_PREF(findOverlappingPairs(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies))
+{
+    int blockCount, threadsPerBlock;
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+    BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
+    BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findOverlappingPairs()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for findPairsLargeD: one thread per small body, at most
+// 64 threads per block. When B_CUDA_USE_TEX is enabled the AABBs of all
+// numBodies + numLarge bodies are bound to the pAABBTex texture for the
+// duration of the kernel.
+void BT_GPU_PREF(findPairsLarge(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge))
+{
+    int blockCount, threadsPerBlock;
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+    BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
+    BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
+#if B_CUDA_USE_TEX
+    BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findPairsLarge()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for computePairCacheChangesD: one thread per body, at
+// most 256 threads per block. pPairBuffStartCurr is handed to the kernel as an
+// array of uint2.
+void BT_GPU_PREF(computePairCacheChanges(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies))
+{
+    int blockCount, threadsPerBlock;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+    BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
+} // computePairCacheChanges()
+
+//----------------------------------------------------------------------------------------
+
+// Host-side launcher for squeezeOverlappingPairBuffD: one thread per body, at
+// most 256 threads per block. pPairBuffStartCurr and pPairOut are handed to
+// the kernel as arrays of uint2.
+void BT_GPU_PREF(squeezeOverlappingPairBuff(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan,  unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies))
+{
+    int blockCount, threadsPerBlock;
+    BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+    BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint2*)pPairOut,pAABB,numBodies));
+    BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
+} // squeezeOverlappingPairBuff()
+
+//------------------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedDefs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedDefs.h
@@ -0,0 +1,61 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+#define BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+
+//----------------------------------------------------------------------------------------
+
+// Host-side entry points of the 3D grid broadphase. BT_GPU_PREF() adds the
+// backend-specific name prefix, so the same declarations serve every backend.
+// Buffers documented as "pairs of unsigned ints" are cast to uint2* by the
+// implementations in btGpu3DGridBroadphaseSharedCode.h.
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Store (cell hash, body index) for each of the numBodies bodies in 'hash'
+// (pairs of unsigned ints), using the centre of each body's AABB.
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies);
+
+// Reset all numCells entries of cellStart to 0xffffffff, then record for each
+// hash value in 'hash' the index of its first entry.
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
+
+// For each body, test against the bodies in its own and the neighbouring grid
+// cells and update its pair list in pPairBuff / pPairBuffStartCurr
+// (pPairBuffStartCurr holds pairs of unsigned ints).
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies);
+
+// For each of the numBodies bodies, test against the numLarge AABBs stored
+// after them in pAABB and update its pair list.
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge);
+
+// For each body, count the flagged entries of its pair list into pPairScan.
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+// For each body, write its flagged pairs to pPairOut (pairs of unsigned ints)
+// and compact its pair list in place.
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedTypes.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedTypes.h
@@ -0,0 +1,64 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+#define BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
+//----------------------------------------------------------------------------------------
+
+// Flag bits stored alongside a handle index in each pair-list entry. Handle
+// indices therefore must not use bits 29 and 30 themselves.
+// FOUND: the entry already existed and the pair was found overlapping again.
+#define BT_3DGRID_PAIR_FOUND_FLG (0x40000000)
+// NEW: the entry was appended during the current pair search.
+#define BT_3DGRID_PAIR_NEW_FLG   (0x20000000)
+// ANY: mask covering both flags; used to test for either and to strip them.
+#define BT_3DGRID_PAIR_ANY_FLG   (BT_3DGRID_PAIR_FOUND_FLG | BT_3DGRID_PAIR_NEW_FLG)
+
+//----------------------------------------------------------------------------------------
+
+// Grid parameters read by the kernels through BT_GPU_params.
+struct bt3DGridBroadphaseParams 
+{
+	// Number of cells per axis. The kernels mask cell coordinates with
+	// (size - 1). NOTE(review): this presumably requires powers of two - confirm.
+	unsigned int	m_gridSizeX;
+	unsigned int	m_gridSizeY;
+	unsigned int	m_gridSizeZ;
+	// NOTE(review): presumably gridSizeX * gridSizeY * gridSizeZ; not read by the shared kernels.
+	unsigned int	m_numCells;
+	// Positions are multiplied by these to obtain cell coordinates.
+	float			m_invCellSizeX;
+	float			m_invCellSizeY;
+	float			m_invCellSizeZ;
+	// NOTE(review): not read by the shared kernels, which take numBodies as an argument.
+	unsigned int	m_numBodies;
+	// Upper bound on the number of hash entries scanned per cell in findPairsInCell().
+	unsigned int	m_maxBodiesPerCell;
+};
+
+//----------------------------------------------------------------------------------------
+
+// Three floats plus one unsigned int. The kernels store each AABB as two of
+// these (min corner, then max corner); the 'uw' field of the min entry is read
+// as the body's handle index.
+// NOTE(review): the meaning of 'uw' in the max entry is not visible here.
+struct bt3DGrid3F1U
+{
+	float			fx;
+	float			fy;
+	float			fz;
+	unsigned int	uw;
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuDefines.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuDefines.h
@@ -0,0 +1,211 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+// definitions for "GPU on CPU" code
+
+
+#ifndef BT_GPU_DEFINES_H
+#define BT_GPU_DEFINES_H
+
+// Plain-struct vector types for the "GPU on CPU" build, so that the shared
+// kernel code can use names such as uint2 and float4 without a GPU toolkit.
+typedef unsigned int uint;
+
+// two signed ints
+struct int2
+{
+	int x, y;
+};
+
+// two unsigned ints
+struct uint2
+{
+	unsigned int x, y;
+};
+
+// three signed ints
+struct int3
+{
+	int x, y, z;
+};
+
+// three unsigned ints
+struct uint3
+{
+	unsigned int x, y, z;
+};
+
+// four floats
+struct float4
+{
+	float x, y, z, w;
+};
+
+// three floats
+struct float3
+{
+	float x, y, z;
+};
+
+
+// CPU replacements for the GPU qualifiers and intrinsics used by the shared code.
+// Device functions become ordinary inline functions.
+#define BT_GPU___device__ inline
+// Device-data and constant-memory qualifiers expand to nothing.
+#define BT_GPU___devdata__
+#define BT_GPU___constant__
+// Function-like max/min; note that each argument may be evaluated twice.
+#define BT_GPU_max(a, b) ((a) > (b) ? (a) : (b))
+#define BT_GPU_min(a, b) ((a) < (b) ? (a) : (b))
+// Name of the parameter object read by the kernels; it is defined outside this header.
+#define BT_GPU_params s3DGridBroadphaseParams
+// 24-bit multiply becomes a plain multiplication.
+#define BT_GPU___mul24(a, b) ((a)*(b))
+// Kernels become ordinary inline functions.
+#define BT_GPU___global__ inline
+// Shared-memory arrays become function-local statics.
+#define BT_GPU___shared__ static
+// Barrier expands to nothing; the emulated threads run one after another.
+#define BT_GPU___syncthreads()
+// NOTE(review): SIMD_PI is not defined in this header - presumably it comes
+// from a Bullet header included by the user of this file.
+#define CUDART_PI_F SIMD_PI
+
+// Constructor helpers for the plain-struct vector types. Each helper fills the
+// components in declaration order and has a BT_GPU_make_* alias.
+
+// uint2 from two components
+static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
+{
+	uint2 v = { x, y };
+	return v;
+}
+#define BT_GPU_make_uint2(x, y) bt3dGrid_make_uint2(x, y)
+
+// int3 from three components
+static inline int3 bt3dGrid_make_int3(int x, int y, int z)
+{
+	int3 v = { x, y, z };
+	return v;
+}
+#define BT_GPU_make_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
+
+// float3 from three components
+static inline float3 bt3dGrid_make_float3(float x, float y, float z)
+{
+	float3 v = { x, y, z };
+	return v;
+}
+#define BT_GPU_make_float3(x, y, z) bt3dGrid_make_float3(x, y, z)
+
+// float3 from the x, y, z of a float4 (w is dropped)
+static inline float3 bt3dGrid_make_float34(float4 f)
+{
+	float3 v = { f.x, f.y, f.z };
+	return v;
+}
+#define BT_GPU_make_float34(f) bt3dGrid_make_float34(f)
+
+// float3 with all three components set to f
+static inline float3 bt3dGrid_make_float31(float f)
+{
+	float3 v = { f, f, f };
+	return v;
+}
+#define BT_GPU_make_float31(x) bt3dGrid_make_float31(x)
+
+// float4 from a float3 plus a w component
+static inline float4 bt3dGrid_make_float42(float3 v, float f)
+{
+	float4 r = { v.x, v.y, v.z, f };
+	return r;
+}
+#define BT_GPU_make_float42(a, b) bt3dGrid_make_float42(a, b) 
+
+// float4 from four components
+static inline float4 bt3dGrid_make_float44(float a, float b, float c, float d)
+{
+	float4 r = { a, b, c, d };
+	return r;
+}
+#define BT_GPU_make_float44(a, b, c, d) bt3dGrid_make_float44(a, b, c, d) 
+
+// Component-wise sum of two int3 values.
+inline int3 operator+(int3 a, int3 b)
+{
+    return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+
+// Component-wise sum of two float4 values (w included).
+inline float4 operator+(const float4& a, const float4& b)
+{
+	float4 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; r.w = a.w+b.w; return r;
+}
+// vector * scalar: every component (w included) is multiplied by fact.
+inline float4 operator*(const float4& a, float fact)
+{
+	float4 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; r.w = a.w*fact; return r;
+}
+// scalar * vector. The vector is taken by const reference so that const
+// objects and temporaries (e.g. 2.0f * (a + b)) are accepted, just like in
+// the vector * scalar form above.
+inline float4 operator*(float fact, const float4& a)
+{
+	return (a * fact);
+}
+// In-place scaling; returns the modified left operand.
+inline float4& operator*=(float4& a, float fact)
+{
+	a = fact * a;
+	return a;
+}
+// In-place component-wise addition; returns the modified left operand.
+inline float4& operator+=(float4& a, const float4& b)
+{
+	a = a + b;
+	return a;
+}
+
+// Component-wise sum of two float3 values.
+inline float3 operator+(const float3& a, const float3& b)
+{
+	float3 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; return r;
+}
+// Component-wise difference of two float3 values.
+inline float3 operator-(const float3& a, const float3& b)
+{
+	float3 r; r.x = a.x-b.x; r.y = a.y-b.y; r.z = a.z-b.z; return r;
+}
+// Dot product of two float3 values. The arguments are const references (as in
+// bt3dGrid_cross) so that const objects and temporaries can be passed.
+static inline float bt3dGrid_dot(const float3& a, const float3& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+#define BT_GPU_dot(a,b) bt3dGrid_dot(a,b)
+
+// Four-component dot product (w included). The arguments are const references
+// so that const objects and temporaries can be passed.
+static inline float bt3dGrid_dot4(const float4& a, const float4& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
+}
+#define BT_GPU_dot4(a,b) bt3dGrid_dot4(a,b)
+
+// Cross product a x b.
+static inline float3 bt3dGrid_cross(const float3& a, const float3& b)
+{
+	float3 r; r.x = a.y*b.z-a.z*b.y; r.y = -a.x*b.z+a.z*b.x; r.z = a.x*b.y-a.y*b.x;	return r;
+}
+#define BT_GPU_cross(a,b) bt3dGrid_cross(a,b)
+
+
+// vector * scalar: every component is multiplied by fact.
+inline float3 operator*(const float3& a, float fact)
+{
+	float3 scaled = { a.x*fact, a.y*fact, a.z*fact };
+	return scaled;
+}
+
+
+// In-place component-wise addition; returns the modified left operand.
+inline float3& operator+=(float3& a, const float3& b)
+{
+	const float3 sum = { a.x+b.x, a.y+b.y, a.z+b.z };
+	a = sum;
+	return a;
+}
+// In-place component-wise subtraction; returns the modified left operand.
+inline float3& operator-=(float3& a, const float3& b)
+{
+	const float3 diff = { a.x-b.x, a.y-b.y, a.z-b.z };
+	a = diff;
+	return a;
+}
+// In-place scaling; returns the modified left operand.
+inline float3& operator*=(float3& a, float fact)
+{
+	const float3 scaled = { a.x*fact, a.y*fact, a.z*fact };
+	a = scaled;
+	return a;
+}
+// Unary minus: negates every component.
+inline float3 operator-(const float3& v)
+{
+	float3 negated = { -v.x, -v.y, -v.z };
+	return negated;
+}
+
+
+// Memory fetches are plain array indexing on the CPU.
+#define BT_GPU_FETCH(a, b) a[b]
+#define BT_GPU_FETCH4(a, b) a[b]
+// Prefix applied to every host-side entry point name.
+#define BT_GPU_PREF(func) btGpu_##func
+// API calls are passed through unchanged; no error checking is added.
+#define BT_GPU_SAFE_CALL(func) func
+// Device memory operations map onto the C library equivalents.
+#define BT_GPU_Memset memset
+#define BT_GPU_MemcpyToSymbol(a, b, c) memcpy(&a, b, c)
+// Texture binding expands to nothing.
+#define BT_GPU_BindTexture(a, b, c, d)
+#define BT_GPU_UnbindTexture(a)
+
+// Emulated launch state read by the kernels. These are file-static, so every
+// translation unit that includes this header gets its own copy.
+static uint2 s_blockIdx, s_blockDim, s_threadIdx;
+#define BT_GPU_blockIdx s_blockIdx
+#define BT_GPU_blockDim s_blockDim
+#define BT_GPU_threadIdx s_threadIdx
+// Emulated kernel launch: calls 'kfunc args' once per (block, thread) pair,
+// sequentially, setting only the .x components of the launch state.
+// The expansion declares loop variables named nb and nt, and evaluates numb
+// and numt repeatedly - pass plain variables that are not called nb or nt.
+#define BT_GPU_EXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
+
+// Error checking after a launch expands to nothing.
+#define BT_GPU_CHECK_ERROR(s)
+
+
+#endif //BT_GPU_DEFINES_H
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedCode.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedCode.h
@@ -0,0 +1,55 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared code for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  will be compiled by both CPU and CUDA compilers
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpuUtilsSharedDefs.h"
+
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Integer division a / b, with the quotient increased by one whenever the
+// division leaves a remainder. b must not be zero.
+int BT_GPU_PREF(iDivUp)(int a, int b)
+{
+    int quotient = a / b;
+    if(a % b != 0)
+    {
+        quotient = quotient + 1;
+    }
+    return quotient;
+} // iDivUp()
+
+//----------------------------------------------------------------------------------------
+
+// compute grid and thread block size for a given number of elements
+//   n          - number of elements to process (one thread each)
+//   blockSize  - maximum number of threads per block
+//   numBlocks  - receives the number of blocks needed to cover n elements
+//   numThreads - receives the threads per block, min(blockSize, n)
+// When there is nothing to launch (n or blockSize is zero or negative) both
+// outputs are set to 0 instead of dividing by zero in iDivUp().
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
+{
+    numThreads = BT_GPU_min(blockSize, n);
+    if(numThreads <= 0)
+    {
+        // empty launch: avoid iDivUp(n, 0)
+        numThreads = 0;
+        numBlocks = 0;
+        return;
+    }
+    numBlocks = BT_GPU_PREF(iDivUp)(n, numThreads);
+} // computeGridSize()
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedDefs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedDefs.h
@@ -0,0 +1,52 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+// Shared definitions for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+#ifndef BTGPUUTILSDHAREDDEFS_H
+#define BTGPUUTILSDHAREDDEFS_H
+
+
+// C-linkage declarations of the shared GPU utility entry points. Each name
+// is passed through BT_GPU_PREF, which must be defined before this header
+// is included.
+extern "C"
+{
+
+
+//Round a / b to nearest higher integer value
+int BT_GPU_PREF(iDivUp)(int a, int b);
+
+// compute grid and thread block size for a given number of elements
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads);
+
+// Array and GL buffer helpers; only declared here.
+void BT_GPU_PREF(allocateArray)(void** devPtr, unsigned int size);
+void BT_GPU_PREF(freeArray)(void* devPtr);
+void BT_GPU_PREF(copyArrayFromDevice)(void* host, const void* device, unsigned int size);
+void BT_GPU_PREF(copyArrayToDevice)(void* device, const void* host, unsigned int size);
+// BT_GPU_PREF wraps only the function name, like every other declaration in
+// this block (previously it enclosed the parameter list as well).
+void BT_GPU_PREF(registerGLBufferObject)(unsigned int vbo);
+void* BT_GPU_PREF(mapGLBufferObject)(unsigned int vbo);
+void BT_GPU_PREF(unmapGLBufferObject)(unsigned int vbo);
+
+
+} // extern "C"
+
+
+#endif // BTGPUUTILSDHAREDDEFS_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/premake4.lua
@@ -0,0 +1,5 @@
+
+	-- Only the AMD sub-project is pulled in; the Intel and NVIDIA ones are
+	-- left commented out.
+	include("AMD")
+--	include("Intel")
+--	include("NVIDIA")
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/AMD/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Define the OpenCL initialization console app only when
+	-- findOpenCL_AMD() reports that the AMD OpenCL SDK is available.
+	-- NOTE(review): hasCL is assigned as a global, exactly as before; other
+	-- build scripts may read it, so confirm before making it local.
+	hasCL = findOpenCL_AMD()
+
+	if hasCL then
+
+		project("OpenCL_intialize_AMD")
+
+		-- vendor-specific OpenCL build settings for this project
+		initOpenCL_AMD()
+
+		language("C++")
+
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+--		includedirs {"..","../../../../include/gpu_research"}
+
+		-- sources live one directory up and are shared by all vendor variants
+		files({
+			"../main.cpp",
+			"../btOpenCLUtils.cpp",
+			"../btOpenCLUtils.h",
+		})
+
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/Intel/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Define the OpenCL initialization console app only when
+	-- findOpenCL_Intel() reports that the Intel OpenCL SDK is available.
+	-- NOTE(review): hasCL is assigned as a global, exactly as before; other
+	-- build scripts may read it, so confirm before making it local.
+	hasCL = findOpenCL_Intel()
+
+	if hasCL then
+
+		project("OpenCL_intialize_Intel")
+
+		-- vendor-specific OpenCL build settings for this project
+		initOpenCL_Intel()
+
+		language("C++")
+
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+--		includedirs {"..","../../../../include/gpu_research"}
+
+		-- sources live one directory up and are shared by all vendor variants
+		files({
+			"../main.cpp",
+			"../btOpenCLUtils.cpp",
+			"../btOpenCLUtils.h",
+		})
+
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/NVIDIA/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Define the OpenCL initialization console app only when
+	-- findOpenCL_NVIDIA() reports that the NVIDIA OpenCL SDK is available.
+	-- NOTE(review): hasCL is assigned as a global, exactly as before; other
+	-- build scripts may read it, so confirm before making it local.
+	hasCL = findOpenCL_NVIDIA()
+
+	if hasCL then
+
+		project("OpenCL_intialize_NVIDIA")
+
+		-- vendor-specific OpenCL build settings for this project
+		initOpenCL_NVIDIA()
+
+		language("C++")
+
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+--		includedirs {"..","../../../../include/gpu_research"}
+
+		-- sources live one directory up and are shared by all vendor variants
+		files({
+			"../main.cpp",
+			"../btOpenCLUtils.cpp",
+			"../btOpenCLUtils.h",
+		})
+
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLInclude.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLInclude.h
@@ -0,0 +1,43 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_OPENCL_INCLUDE_H
+#define BT_OPENCL_INCLUDE_H
+
+
+#ifdef __APPLE__
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#else
+#include <OpenCL/cl.h>
+#endif
+#else
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#else
+#include <CL/cl.h>
+#ifdef _WIN32
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif
+#endif //__APPLE__
+
+#include <assert.h>
+#include <stdio.h>
+#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
+
+
+#endif //BT_OPENCL_INCLUDE_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.cpp
@@ -0,0 +1,731 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+
+#include <string.h>
+
+#include "btOpenCLUtils.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define BT_MAX_CL_DEVICES 16 //who needs 16 devices?
+
+#ifdef _WIN32
+#include <Windows.h>
+#include <assert.h>
+
+#define btAssert assert
+#endif
+
+//Set the preferred platform vendor using the OpenCL SDK.
+//Exactly one of the CL_PLATFORM_* macros is expected from the build; without
+//any of them the vendor is "Unknown Vendor". The pointer is const-qualified
+//because it refers to a string literal.
+static const char* spPlatformVendor = 
+#if defined(CL_PLATFORM_MINI_CL)
+"MiniCL, SCEA";
+#elif defined(CL_PLATFORM_AMD)
+"Advanced Micro Devices, Inc.";
+#elif defined(CL_PLATFORM_NVIDIA)
+"NVIDIA Corporation";
+#elif defined(CL_PLATFORM_INTEL)
+"Intel(R) Corporation";
+#else
+"Unknown Vendor";
+#endif
+
+#ifndef CL_PLATFORM_MINI_CL
+#ifdef _WIN32
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif
+
+// Return the number of OpenCL platforms reported by clGetPlatformIDs.
+// pErrNum (optional) always receives the clGetPlatformIDs status, so callers
+// can test it without pre-initializing it. Returns 0 when the query fails.
+int btOpenCLUtils::getNumPlatforms(cl_int* pErrNum)
+{
+	cl_uint numPlatforms=0;
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+
+	if(pErrNum != NULL) 
+		*pErrNum = ciErrNum;
+
+	if(ciErrNum != CL_SUCCESS)
+		return 0;
+
+	return (int)numPlatforms;
+}
+
+// Vendor string this file was built for: the value of spPlatformVendor,
+// which is selected by the CL_PLATFORM_* build macros.
+const char* btOpenCLUtils::getSdkVendorName()
+{
+	const char* const vendorName = spPlatformVendor;
+	return vendorName;
+}
+
+// Return the platform id at platformIndex, or 0 when the index is out of
+// range or a platform query fails. pErrNum (optional) is written only when a
+// clGetPlatformIDs call fails; it is left untouched otherwise.
+cl_platform_id btOpenCLUtils::getPlatform(int platformIndex, cl_int* pErrNum)
+{
+	cl_platform_id platform = 0;
+
+	cl_uint numPlatforms = 0;
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		// without a valid count there is nothing to index into
+		if(pErrNum != NULL) 
+			*pErrNum = ciErrNum;
+		return platform;
+	}
+	
+	if (platformIndex>=0 && (cl_uint)platformIndex<numPlatforms)
+	{
+		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum == CL_SUCCESS)
+		{
+			platform = platforms[platformIndex];
+		}
+		else if(pErrNum != NULL) 
+		{
+			*pErrNum = ciErrNum;
+		}
+
+		// released on both the success and the failure path
+		delete[] platforms;
+	}
+
+	return platform;
+}
+
+// Fill platformInfo with the vendor, name and version strings of 'platform'.
+// Each query is checked with oclCHECKERROR against CL_SUCCESS.
+void btOpenCLUtils::getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo& platformInfo)
+{
+	// queries and their destination buffers, in the order they are issued
+	const cl_platform_info queries[3] = { CL_PLATFORM_VENDOR, CL_PLATFORM_NAME, CL_PLATFORM_VERSION };
+	char* const targets[3] = { platformInfo.m_platformVendor, platformInfo.m_platformName, platformInfo.m_platformVersion };
+
+	for (int q = 0; q < 3; ++q)
+	{
+		cl_int ciErrNum = clGetPlatformInfo(platform, queries[q], BT_MAX_STRING_LENGTH, targets[q], NULL);
+		oclCHECKERROR(ciErrNum,CL_SUCCESS);
+	}
+}
+
+// Create an OpenCL context on 'platform' for devices of 'deviceType'.
+//   pErrNum              - optional; receives the status of the last OpenCL call made here
+//   pGLContext, pGLDC    - when both are non-NULL, GL sharing properties are added
+//   preferredDeviceIndex - when in range (and no GL context is given), the
+//                          context is created for that single device;
+//                          otherwise for all devices found
+//   preferredPlatformIndex - not used by this function
+// Returns 0 when the device query or the context creation fails.
+cl_context btOpenCLUtils::createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	cl_context retContext = 0;
+	cl_int ciErrNum=0;
+
+	/*     
+	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the     
+	* implementation thinks we should be using.     
+	*/
+	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
+	cps[0] = CL_CONTEXT_PLATFORM;
+	cps[1] = (cl_context_properties)platform;
+	if (pGLContext && pGLDC)
+	{
+		cps[2] = CL_GL_CONTEXT_KHR;
+		cps[3] = (cl_context_properties)pGLContext;
+		cps[4] = CL_WGL_HDC_KHR;
+		cps[5] = (cl_context_properties)pGLDC;
+	}
+
+	cl_uint num_entries = BT_MAX_CL_DEVICES;
+	cl_device_id devices[BT_MAX_CL_DEVICES];
+
+	cl_uint num_devices=0;
+
+	ciErrNum = clGetDeviceIDs(	
+		platform,
+		deviceType,
+		num_entries,
+		devices,
+		&num_devices);
+
+	if (ciErrNum != CL_SUCCESS)
+	{
+		// devices[] holds nothing usable (e.g. no device of this type)
+		if(pErrNum != NULL) 
+		{
+			*pErrNum = ciErrNum;
+		}
+		return 0;
+	}
+
+	// num_devices counts every matching device, which can exceed the number
+	// of entries actually written into devices[]
+	if (num_devices > num_entries)
+	{
+		num_devices = num_entries;
+	}
+
+	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
+
+	if (pGLContext)
+	{
+		//search for the GPU that relates to the OpenCL context
+		for (cl_uint i=0;i<num_devices;i++)
+		{
+			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
+			if (ciErrNum==CL_SUCCESS)
+				break;
+		}
+	}
+	else
+	{
+		if (preferredDeviceIndex>=0 && (cl_uint)preferredDeviceIndex<num_devices)
+		{
+			//create a context of the preferred device index
+			retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
+		} else
+		{
+			//create a context of all devices
+			retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
+		}
+	}
+	if(pErrNum != NULL) 
+	{
+		*pErrNum = ciErrNum;
+	}
+
+	return retContext;
+}
+
+// Create an OpenCL context for 'deviceType' on the first platform that
+// yields one. Before trying platforms in order, one platform is moved to the
+// front of the list: the first one (scanning from index 0) that is either at
+// preferredPlatformIndex or whose vendor string equals spPlatformVendor.
+// The details of the platform that produced the context are printed.
+//   pErrNum - optional; receives the status of a failed platform query, or
+//             whatever createContextFromPlatform reports
+// Returns 0 when no platform produces a context.
+cl_context btOpenCLUtils::createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	cl_uint numPlatforms = 0;
+	cl_context retContext = 0;
+	
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		if(pErrNum != NULL) *pErrNum = ciErrNum;
+		return NULL;
+	}
+	if(numPlatforms > 0)     
+	{        
+		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum != CL_SUCCESS)
+		{
+			if(pErrNum != NULL) *pErrNum = ciErrNum;
+			delete[] platforms;
+			return NULL;
+		}
+		cl_uint i;
+
+		for ( i = 0; i < numPlatforms; ++i)         
+		{            
+			char pbuf[128];            
+			ciErrNum = clGetPlatformInfo(	platforms[i],
+				CL_PLATFORM_VENDOR,                                       
+				sizeof(pbuf),                                       
+				pbuf,                                       
+				NULL);
+			if(ciErrNum != CL_SUCCESS)
+			{
+				if(pErrNum != NULL) *pErrNum = ciErrNum;
+				delete[] platforms;
+				return NULL;
+			}
+
+			const bool isPreferredIndex = (preferredPlatformIndex>=0 && i==(cl_uint)preferredPlatformIndex);
+			if (isPreferredIndex || !strcmp(pbuf, spPlatformVendor))
+			{
+				// move the selected platform to the front so it is tried first
+				cl_platform_id tmpPlatform = platforms[0];
+				platforms[0] = platforms[i];
+				platforms[i] = tmpPlatform;
+				break;
+			}
+		}
+
+		for (i = 0; i < numPlatforms; ++i)         
+		{
+			cl_platform_id platform = platforms[i];
+			assert(platform);
+
+			retContext = btOpenCLUtils::createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex);
+
+			if (retContext)
+			{
+//				printf("OpenCL platform details:\n");
+				btOpenCLPlatformInfo platformInfo;
+
+				btOpenCLUtils::getPlatformInfo(platform, platformInfo);
+
+				printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+				printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+				printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+
+				break;
+			}
+		}
+
+		delete[] platforms;    
+	}
+	return retContext;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the device id, or (cl_device_id)-1 when deviceIndex is out of
+//!         range or the context's device list cannot be obtained
+//! @param cxMainContext         OpenCL context
+//! @param deviceIndex           zero-based index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+cl_device_id btOpenCLUtils::getDevice(cl_context cxMainContext, int deviceIndex)
+{
+	size_t szParmDataBytes = 0;
+
+	// get the size of the list of devices associated with context
+	cl_int ciErrNum = clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
+
+	// valid indices are 0 .. numDevices-1
+	const size_t numDevices = szParmDataBytes / sizeof(cl_device_id);
+	if( ciErrNum != CL_SUCCESS || deviceIndex < 0 || (size_t)deviceIndex >= numDevices ) {
+		return (cl_device_id)-1;
+	}
+
+	cl_device_id* cdDevices = (cl_device_id*) malloc(szParmDataBytes);
+	if( cdDevices == NULL ) {
+		return (cl_device_id)-1;
+	}
+
+	cl_device_id device = (cl_device_id)-1;
+	ciErrNum = clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
+	if( ciErrNum == CL_SUCCESS ) {
+		device = cdDevices[deviceIndex];
+	}
+	free(cdDevices);
+
+	return device;
+}
+
+// Return the number of devices associated with cxMainContext, derived from
+// the byte size of its CL_CONTEXT_DEVICES list. Returns 0 when the query
+// fails.
+int btOpenCLUtils::getNumDevices(cl_context cxMainContext)
+{
+	size_t szParamDataBytes = 0;
+	cl_int ciErrNum = clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes);
+	if (ciErrNum != CL_SUCCESS)
+	{
+		return 0;
+	}
+	return (int)(szParamDataBytes / sizeof(cl_device_id));
+}
+
+// Query 'device' through getDeviceInfo and print its properties to stdout.
+// size_t values are cast to unsigned int before being printed with %u, the
+// same way the memory sizes are.
+void btOpenCLUtils::printDeviceInfo(cl_device_id device)
+{
+	btOpenCLDeviceInfo info;
+	// start from zeroed fields so that a query failing inside getDeviceInfo
+	// cannot leave uninitialized data to be printed
+	memset(&info, 0, sizeof(info));
+	getDeviceInfo(device,info);
+
+	printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
+	printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
+	printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);
+
+	// the device type is a bit field, so more than one line may be printed
+	if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
+	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
+
+	printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
+	printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", (unsigned int)info.m_workitemDims);
+	printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)info.m_workItemSize[0], (unsigned int)info.m_workItemSize[1], (unsigned int)info.m_workItemSize[2]);
+	printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", (unsigned int)info.m_workgroupSize);
+	printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
+	printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
+	printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
+	printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
+	printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
+	printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
+	printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
+	printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));
+	if( info.m_queueProperties  & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
+		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");    
+	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
+		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
+
+	printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
+
+	printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
+	printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
+	printf("\n  CL_DEVICE_IMAGE <dim>"); 
+	printf("\t\t\t2D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image2dMaxWidth);
+	printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image2dMaxHeight);
+	printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image3dMaxWidth);
+	printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image3dMaxHeight);
+	printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", (unsigned int)info.m_image3dMaxDepth);
+	// m_deviceExtensions is an array, so test its first character rather
+	// than its address (which is never 0)
+	if (info.m_deviceExtensions[0] != 0) 
+		printf("\n  CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
+	else 
+		printf("  CL_DEVICE_EXTENSIONS: None\n");
+	printf("  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t"); 
+	printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n", 
+		info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble); 
+
+
+}
+
+// Fill 'info' with the properties of 'device', one clGetDeviceInfo query per
+// field. The individual query results are not checked; 'info' is zeroed
+// first so that a field whose query fails (for instance a string that does
+// not fit in BT_MAX_STRING_LENGTH) is left as 0 / an empty string.
+void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo& info)
+{
+	memset(&info, 0, sizeof(info));
+
+	// CL_DEVICE_NAME
+	clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, info.m_deviceName, NULL);
+
+	// CL_DEVICE_VENDOR
+	clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, info.m_deviceVendor, NULL);
+
+	// CL_DRIVER_VERSION
+	clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, info.m_driverVersion, NULL);
+
+	// CL_DEVICE_INFO
+	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info.m_deviceType, NULL);
+
+	// CL_DEVICE_MAX_COMPUTE_UNITS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info.m_computeUnits), &info.m_computeUnits, NULL);
+
+	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
+	// OpenCL returns this value as a cl_uint while the field is a size_t, so
+	// it is read into a correctly sized temporary and then widened.
+	cl_uint workitemDims = 0;
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitemDims), &workitemDims, NULL);
+	info.m_workitemDims = workitemDims;
+
+	// CL_DEVICE_MAX_WORK_ITEM_SIZES
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info.m_workItemSize), &info.m_workItemSize, NULL);
+
+	// CL_DEVICE_MAX_WORK_GROUP_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info.m_workgroupSize), &info.m_workgroupSize, NULL);
+
+	// CL_DEVICE_MAX_CLOCK_FREQUENCY
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info.m_clockFrequency), &info.m_clockFrequency, NULL);
+
+	// CL_DEVICE_ADDRESS_BITS
+	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info.m_addressBits), &info.m_addressBits, NULL);
+
+	// CL_DEVICE_MAX_MEM_ALLOC_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info.m_maxMemAllocSize), &info.m_maxMemAllocSize, NULL);
+
+	// CL_DEVICE_GLOBAL_MEM_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info.m_globalMemSize), &info.m_globalMemSize, NULL);
+
+	// CL_DEVICE_ERROR_CORRECTION_SUPPORT
+	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info.m_errorCorrectionSupport), &info.m_errorCorrectionSupport, NULL);
+
+	// CL_DEVICE_LOCAL_MEM_TYPE
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info.m_localMemType), &info.m_localMemType, NULL);
+
+	// CL_DEVICE_LOCAL_MEM_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info.m_localMemSize), &info.m_localMemSize, NULL);
+
+	// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info.m_constantBufferSize), &info.m_constantBufferSize, NULL);
+
+	// CL_DEVICE_QUEUE_PROPERTIES
+	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info.m_queueProperties), &info.m_queueProperties, NULL);
+
+	// CL_DEVICE_IMAGE_SUPPORT
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info.m_imageSupport), &info.m_imageSupport, NULL);
+
+	// CL_DEVICE_MAX_READ_IMAGE_ARGS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info.m_maxReadImageArgs), &info.m_maxReadImageArgs, NULL);
+
+	// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info.m_maxWriteImageArgs), &info.m_maxWriteImageArgs, NULL);
+
+	// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info.m_image2dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info.m_image2dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info.m_image3dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info.m_image3dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info.m_image3dMaxDepth, NULL);
+
+	// CL_DEVICE_EXTENSIONS: the raw extension string of the device
+	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, info.m_deviceExtensions, NULL);
+
+	// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info.m_vecWidthChar, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info.m_vecWidthShort, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info.m_vecWidthInt, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info.m_vecWidthLong, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info.m_vecWidthFloat, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info.m_vecWidthDouble, NULL);
+}
+
+// Return a pointer into 'name' just past the last occurrence of 'pattern',
+// or 'name' itself when the pattern does not occur. It is used below to drop
+// leading directory components from a file name. An empty pattern returns
+// 'name' unchanged: strstr would match it at every position without ever
+// advancing, so the scan would never terminate.
+static const char* strip2(const char* name, const char* pattern)
+{
+	size_t const patlen = strlen(pattern);
+	if (patlen == 0)
+	{
+		return name;
+	}
+
+	const char* tail = name;
+	const char* patloc = strstr(tail, pattern);
+	while (patloc != NULL)
+	{
+		// continue searching right after this occurrence
+		tail = patloc + patlen;
+		patloc = strstr(tail, pattern);
+	}
+	return tail;
+}
+
+// Build an OpenCL program for 'device'.
+//
+// When clFileNameForCaching is given, a binary cache file
+// "cache/<file name>.<device name>.<driver version>.bin" is used: on _WIN32
+// the cached binary is loaded if it is at least as new as the source file
+// (in non-_DEBUG builds also when the source file cannot be opened), and
+// after a successful build from source the binary is written back to that
+// file. If the cached binary cannot be loaded or built, the program is
+// compiled from kernelSource.
+//
+//   kernelSource     - OpenCL C source, used when no valid cached binary exists
+//   pErrNum          - optional; receives the failing status, or CL_SUCCESS
+//   additionalMacros - extra build options; NULL is treated as an empty string
+// Returns the program, or 0 on failure.
+cl_program btOpenCLUtils::compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros , const char* clFileNameForCaching)
+{
+
+	cl_program m_cpProgram=0;
+	cl_int status;
+
+	// strlen/sprintf below must never see a NULL pointer
+	const char* extraMacros = additionalMacros ? additionalMacros : "";
+
+	char binaryFileName[522];
+
+	if (clFileNameForCaching)
+	{
+		
+		// zero-filled so the file name stays well defined if a query fails
+		char deviceName[256] = "";
+		char driverVersion[256] = "";
+		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
+		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
+
+		
+		// keep only the part after the last '\' and after the last '/'
+		const char* strippedName = strip2(clFileNameForCaching,"\\");
+		strippedName = strip2(strippedName,"/");
+
+		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
+		//printf("searching for %s\n", binaryFileName);
+
+#ifdef _WIN32
+		bool fileUpToDate = false;
+		bool binaryFileValid=false;
+
+		FILETIME modtimeBinary; 
+
+		CreateDirectory("cache",0);
+		{
+			
+			HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+			if (binaryFileHandle ==INVALID_HANDLE_VALUE)
+			{
+				DWORD errorCode;
+				errorCode = GetLastError();
+				switch (errorCode)
+				{
+				case ERROR_FILE_NOT_FOUND:
+					{
+						printf("\nCached file not found %s\n", binaryFileName);
+						break;
+					}
+				case ERROR_PATH_NOT_FOUND:
+					{
+						printf("\nCached file path not found %s\n", binaryFileName);
+						break;
+					}
+				default:
+					{
+						printf("\nFailed reading cached file with errorCode = %d\n", errorCode);
+					}
+				}
+			} else
+			{
+				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
+				{
+					DWORD errorCode;
+					errorCode = GetLastError();
+					printf("\nGetFileTime errorCode = %d\n", errorCode);
+				} else
+				{
+					binaryFileValid = true;
+				}
+				CloseHandle(binaryFileHandle);
+			}
+
+			if (binaryFileValid)
+			{
+				HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+				if (srcFileHandle!=INVALID_HANDLE_VALUE)
+				{
+					FILETIME modtimeSrc; 
+					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
+					{
+						// no source timestamp to compare against: leave the
+						// cache marked as not up to date
+						DWORD errorCode;
+						errorCode = GetLastError();
+						printf("\nGetFileTime errorCode = %d\n", errorCode);
+					}
+					else if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
+						||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
+					{
+						fileUpToDate=true;
+					} else
+					{
+						printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
+					}
+					CloseHandle(srcFileHandle);
+				} 
+				else
+				{
+#ifdef _DEBUG
+					DWORD errorCode;
+					errorCode = GetLastError();
+					switch (errorCode)
+					{
+					case ERROR_FILE_NOT_FOUND:
+						{
+							printf("\nSrc file not found %s\n", clFileNameForCaching);
+							break;
+						}
+					case ERROR_PATH_NOT_FOUND:
+						{
+							printf("\nSrc path not found %s\n", clFileNameForCaching);
+							break;
+						}
+					default:
+						{
+							printf("\nnSrc file reading errorCode = %d\n", errorCode);
+						}
+					}
+
+					//we should make sure the src file exists so we can verify the timestamp with binary
+					assert(0);
+#else
+					//if we cannot find the source, assume it is OK in release builds
+					fileUpToDate = true;
+#endif
+				}
+			}
+			
+
+		}
+
+		if( fileUpToDate)
+		{
+			FILE* file = fopen(binaryFileName, "rb");
+			if (file)
+			{
+				fseek( file, 0L, SEEK_END );
+				long fileSize = ftell( file );
+				rewind( file );
+
+				// ftell reports -1 on error; an empty file holds no program
+				if (fileSize > 0)
+				{
+					size_t binarySize = (size_t)fileSize;
+					char* binary = new char[binarySize];
+					size_t bytesRead = fread( binary, sizeof(char), binarySize, file );
+
+					if (bytesRead == binarySize)
+					{
+						m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
+						btAssert( status == CL_SUCCESS );
+						if( status == CL_SUCCESS )
+						{
+							status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
+							btAssert( status == CL_SUCCESS );
+
+							if( status != CL_SUCCESS )
+							{
+								char *build_log;
+								size_t ret_val_size = 0;
+								clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+								build_log = new char[ret_val_size+1];
+								clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+								build_log[ret_val_size] = '\0';
+								printf("%s\n", build_log);
+								delete[] build_log;
+								btAssert(0);
+							}
+						}
+
+						if( status != CL_SUCCESS )
+						{
+							// unusable cached binary: drop it and fall back
+							// to compiling from source below
+							if (m_cpProgram)
+							{
+								clReleaseProgram(m_cpProgram);
+							}
+							m_cpProgram = 0;
+						}
+					}
+					delete[] binary;
+				}
+				fclose( file );
+			}
+		}
+#endif //_WIN32
+		
+	}
+	
+	if (!m_cpProgram)
+	{
+		cl_int localErrNum;
+		size_t program_length = strlen(kernelSource);
+
+		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+
+		// Build the program with 'mad' Optimization option
+
+
+	#ifdef MAC
+		const char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
+	#else
+		//const char* flags = "-DGUID_ARG= -fno-alias";
+		const char* flags = "-DGUID_ARG= ";
+	#endif
+
+		char* compileFlags = new char[strlen(extraMacros) + strlen(flags) + 5];
+		sprintf(compileFlags, "%s %s", flags, extraMacros);
+		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			char *build_log;
+			size_t ret_val_size = 0;
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+			build_log = new char[ret_val_size+1];
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+
+			// to be careful, terminate with \0
+			// there's no information in the reference whether the string is 0 terminated or not
+			build_log[ret_val_size] = '\0';
+
+
+			printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
+			delete[] build_log;
+			// nothing is handed back to the caller, so free what was created here
+			delete [] compileFlags;
+			clReleaseProgram(m_cpProgram);
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+
+		if( clFileNameForCaching )
+		{	//	write to binary
+
+			cl_uint numAssociatedDevices = 0;
+			status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
+			btAssert( status == CL_SUCCESS );
+			if (status == CL_SUCCESS && numAssociatedDevices==1)
+			{
+
+				size_t binarySize = 0;
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
+				btAssert( status == CL_SUCCESS );
+
+				if (status == CL_SUCCESS && binarySize > 0)
+				{
+					char* binary = new char[binarySize];
+
+					// CL_PROGRAM_BINARIES takes an array of buffer pointers,
+					// one per device; there is exactly one device here
+					status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
+					btAssert( status == CL_SUCCESS );
+
+					if (status == CL_SUCCESS)
+					{
+						FILE* file = fopen(binaryFileName, "wb");
+						if (file)
+						{
+							fwrite( binary, sizeof(char), binarySize, file );
+							fclose( file );
+						} else
+						{
+							printf("cannot write file %s\n", binaryFileName);
+						}
+					}
+
+					delete [] binary;
+				}
+			}
+		}
+		delete [] compileFlags;
+	}
+
+	if (pErrNum)
+		*pErrNum = CL_SUCCESS;
+	return m_cpProgram;
+}
+
+
+// Create the kernel named kernelName.
+//   prog             - program to take the kernel from; when 0, a program is
+//                      compiled from kernelSource (with additionalMacros)
+//                      just for this call and released again before returning
+//   kernelSource     - only read when prog is 0
+//   pErrNum          - optional; receives CL_SUCCESS or the failing status
+// Returns the kernel, or 0 when compiling or clCreateKernel fails.
+cl_kernel btOpenCLUtils::compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
+{
+	printf("compiling kernel %s ",kernelName);
+	cl_kernel kernel;
+	cl_int localErrNum;
+
+
+	cl_program m_cpProgram = prog;
+	if (!m_cpProgram)
+	{
+		m_cpProgram = compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros);
+		if (!m_cpProgram)
+		{
+			// compileCLProgramFromString stores its status in pErrNum on
+			// every path that returns 0
+			return 0;
+		}
+	}
+
+
+	// Create the kernel
+	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
+
+	// A program compiled only for this call is released whether or not the
+	// kernel could be created; a caller-supplied program is left alone.
+	if (!prog)
+	{
+		clReleaseProgram(m_cpProgram);
+	}
+
+	if (localErrNum != CL_SUCCESS)
+	{
+		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
+		if (pErrNum)
+			*pErrNum = localErrNum;
+		return 0;
+	}
+
+	printf("ready. \n");
+
+
+	if (pErrNum)
+			*pErrNum = CL_SUCCESS;
+	return kernel;
+
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.h
@@ -0,0 +1,104 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+
+#ifndef BT_OPENCL_UTILS_H
+#define BT_OPENCL_UTILS_H
+
+#include "btOpenCLInclude.h"
+
+
+#define BT_MAX_STRING_LENGTH 1024
+
+/// Plain data record describing one OpenCL device; it is the output argument of
+/// btOpenCLUtils::getDeviceInfo().
+/// NOTE(review): the member names suggest that each field mirrors one clGetDeviceInfo query
+/// (e.g. m_computeUnits <-> CL_DEVICE_MAX_COMPUTE_UNITS); the code that fills the record is
+/// not part of this header - confirm against btOpenCLUtils.cpp.
+struct btOpenCLDeviceInfo
+{
+	// Fixed-size text buffers, BT_MAX_STRING_LENGTH (1024) chars each.
+	char m_deviceName[BT_MAX_STRING_LENGTH];
+	char m_deviceVendor[BT_MAX_STRING_LENGTH];
+	char m_driverVersion[BT_MAX_STRING_LENGTH];
+	char m_deviceExtensions[BT_MAX_STRING_LENGTH];
+
+	cl_device_type		m_deviceType;
+	cl_uint 				m_computeUnits;
+	size_t 					m_workitemDims;
+	// One entry per work-item dimension; the array is fixed at 3 entries.
+	size_t 					m_workItemSize[3];
+	size_t 					m_image2dMaxWidth;
+	size_t 					m_image2dMaxHeight;
+	size_t 					m_image3dMaxWidth;
+	size_t 					m_image3dMaxHeight;
+	size_t 					m_image3dMaxDepth;
+	size_t 					m_workgroupSize;
+	cl_uint 				m_clockFrequency;
+	cl_ulong				m_constantBufferSize;
+	cl_ulong				m_localMemSize;
+	cl_ulong				m_globalMemSize;
+    cl_bool					m_errorCorrectionSupport;
+	cl_device_local_mem_type m_localMemType;
+	cl_uint					m_maxReadImageArgs;
+	cl_uint					m_maxWriteImageArgs;
+
+
+
+	cl_uint 				m_addressBits;
+	cl_ulong				m_maxMemAllocSize;
+	cl_command_queue_properties m_queueProperties;
+	cl_bool					m_imageSupport;
+	// Presumably the preferred vector widths per element type - TODO confirm in getDeviceInfo().
+	cl_uint					m_vecWidthChar;
+	cl_uint					m_vecWidthShort;
+	cl_uint					m_vecWidthInt;
+	cl_uint					m_vecWidthLong;
+	cl_uint					m_vecWidthFloat;
+	cl_uint					m_vecWidthDouble;
+
+};
+
+/// Plain data record describing one OpenCL platform; it is the output argument of
+/// btOpenCLUtils::getPlatformInfo(). Each member is a fixed-size text buffer of
+/// BT_MAX_STRING_LENGTH (1024) chars.
+struct btOpenCLPlatformInfo
+{
+	char m_platformVendor[BT_MAX_STRING_LENGTH];
+	char m_platformName[BT_MAX_STRING_LENGTH];
+	char m_platformVersion[BT_MAX_STRING_LENGTH];
+};
+
+/// Collection of static helper functions for setting up OpenCL: creating contexts, enumerating
+/// platforms and devices, and compiling programs/kernels from source strings.
+/// The class has no data members; every function is static.
+class btOpenCLUtils
+{
+public:
+
+	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
+	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
+	/// The error code is reported through pErrNum.
+	/// NOTE(review): -1 for preferredDeviceIndex/preferredPlatformIndex presumably means "no preference" - confirm in the .cpp.
+	static cl_context 	createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1);
+	
+	/// Number of devices in the context; valid indices for getDevice are [0..getNumDevices).
+	static int getNumDevices(cl_context cxMainContext);
+	/// get the nr'th device of the context
+	static cl_device_id getDevice(cl_context cxMainContext, int nr);
+	/// Fills 'info' with the properties of 'device'.
+	static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo& info);
+	/// Prints the properties of 'device' (presumably to stdout - confirm in the .cpp).
+	static void printDeviceInfo(cl_device_id device);
+
+	/// Creates the kernel 'kernelName'. If 'prog' is 0 the program is first compiled from kernelSource
+	/// (with additionalMacros) through compileCLProgramFromString; otherwise 'prog' is used as is.
+	/// Returns 0 on failure; pErrNum, when given, receives the OpenCL error code.
+	static cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" );
+
+	//optional
+	/// Compiles kernelSource into a program that can be shared by several compileCLKernelFromString calls.
+	/// NOTE(review): srcFileNameForCaching looks like it enables an on-disk cache of the compiled binary - confirm in the .cpp.
+	static cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0);
+
+	//the following optional APIs provide access using specific platform information
+	static int getNumPlatforms(cl_int* pErrNum=0);
+	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
+	static cl_platform_id getPlatform(int nr, cl_int* pErrNum=0);
+	/// Fills platformInfo with the vendor, name and version strings of 'platform'.
+	static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo& platformInfo);
+	/// Name of the OpenCL SDK this code was compiled against (the basic_initialize sample prints it as such).
+	static const char* getSdkVendorName();
+	/// Same as createContextFromType, but restricted to the given platform.
+	static cl_context 	createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1);
+};
+
+
+
+#endif // BT_OPENCL_UTILS_H
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/main.cpp
@@ -0,0 +1,92 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "btOpenCLUtils.h"
+#include <stdio.h>
+
+// Context and command queue used by the createContextFromType part of main() below.
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+
+
+
+/// Sample program for the btOpenCLUtils helpers.
+/// 1) Lists every OpenCL platform and prints the devices of a context created on it.
+/// 2) Creates a GPU context with createContextFromType and a command queue for each of its devices.
+/// argc and argv are not used. Returns 0 on the normal path.
+int main(int argc, char* argv[])
+{
+	// cl_int, because the address of this variable is passed to cl_int* out-parameters
+	cl_int ciErrNum = CL_SUCCESS;
+	
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
+
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+	int numPlatforms = btOpenCLUtils::getNumPlatforms();
+	printf("Num Platforms = %d\n", numPlatforms);
+
+	for (int i=0;i<numPlatforms;i++)
+	{
+		cl_platform_id platform = btOpenCLUtils::getPlatform(i);
+		btOpenCLPlatformInfo platformInfo;
+		btOpenCLUtils::getPlatformInfo(platform,platformInfo);
+		printf("--------------------------------\n");
+		printf("Platform info for platform nr %d:\n",i);
+		printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+		printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+		printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+		
+		cl_context context = btOpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
+		if (!context)
+		{
+			// Without a context there is nothing to enumerate or release for this platform;
+			// report it and carry on with the next one.
+			printf("Cannot create an OpenCL context for platform nr %d (error %d)\n", i, (int)ciErrNum);
+			continue;
+		}
+		
+		int numDevices = btOpenCLUtils::getNumDevices(context);
+		printf("Num Devices = %d\n", numDevices);
+		for (int j=0;j<numDevices;j++)
+		{
+			cl_device_id dev = btOpenCLUtils::getDevice(context,j);
+			// devInfo shows how to obtain the raw values; the printing is done by printDeviceInfo
+			btOpenCLDeviceInfo devInfo;
+			btOpenCLUtils::getDeviceInfo(dev,devInfo);
+			btOpenCLUtils::printDeviceInfo(dev);
+		}
+
+		clReleaseContext(context);
+	}
+
+	///Easier method to initialize OpenCL using createContextFromType for a GPU
+	deviceType = CL_DEVICE_TYPE_GPU;
+	
+	// no OpenGL interop in this sample, so no GL context / device context is passed
+	void* glCtx=0;
+	void* glDC = 0;
+	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
+	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
+
+	for (int i=0;i<numDev;i++)
+	{
+		cl_device_id		device;
+		device = btOpenCLUtils::getDevice(g_cxMainContext,i);
+		btOpenCLDeviceInfo clInfo;
+		btOpenCLUtils::getDeviceInfo(device,clInfo);
+		btOpenCLUtils::printDeviceInfo(device);
+		// create a command-queue
+		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		//normally you would create and execute kernels using this command queue
+
+		clReleaseCommandQueue(g_cqCommandQue);
+	}
+
+	clReleaseContext(g_cxMainContext);
+		
+	return 0;
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/premake4.lua
@@ -0,0 +1,4 @@
+
+	-- Pull in the per-vendor build scripts, in the same order as before.
+	for _, vendorDir in ipairs({ "AMD", "Intel", "NVIDIA" }) do
+		include(vendorDir)
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/AMD/premake4.lua
@@ -0,0 +1,49 @@
+	
+	-- OpenCL broadphase benchmark, AMD flavour.
+	-- The project is declared only when findOpenCL_AMD() finds the AMD OpenCL SDK.
+	hasCL = findOpenCL_AMD()
+
+	if hasCL then
+		project("OpenCL_broadphase_benchmark_AMD")
+
+		-- NOTE(review): the init* helpers are defined outside this script; the call
+		-- order is kept as it was because they may change the active configuration.
+		initOpenCL_AMD()
+
+		language("C++")
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+		includedirs({
+			"../../../rendering/BulletMath",
+			"../../primitives",
+			"../../../../../src",
+		})
+
+		files({
+			-- benchmark sources
+			"../main.cpp",
+			"../findPairsOpenCL.cpp",
+			"../findPairsOpenCL.h",
+			"../btGridBroadphaseCL.cpp",
+			"../btGridBroadphaseCL.h",
+			-- shared 3D grid broadphase
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			-- Bullet sources compiled directly into the benchmark
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			-- OpenCL utilities and OpenGL interop
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h",
+		})
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/Intel/premake4.lua
@@ -0,0 +1,49 @@
+	
+	-- OpenCL broadphase benchmark, Intel flavour.
+	-- The project is declared only when findOpenCL_Intel() finds the Intel OpenCL SDK.
+	hasCL = findOpenCL_Intel()
+
+	if hasCL then
+		project("OpenCL_broadphase_benchmark_Intel")
+
+		-- NOTE(review): the init* helpers are defined outside this script; the call
+		-- order is kept as it was because they may change the active configuration.
+		initOpenCL_Intel()
+
+		language("C++")
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+		includedirs({
+			"../../../rendering/BulletMath",
+			"../../primitives",
+			"../../../../../src",
+		})
+
+		files({
+			-- benchmark sources
+			"../main.cpp",
+			"../findPairsOpenCL.cpp",
+			"../findPairsOpenCL.h",
+			"../btGridBroadphaseCL.cpp",
+			"../btGridBroadphaseCL.h",
+			-- shared 3D grid broadphase
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			-- Bullet sources compiled directly into the benchmark
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			-- OpenCL utilities and OpenGL interop
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h",
+		})
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/NVIDIA/premake4.lua
@@ -0,0 +1,49 @@
+	
+	-- OpenCL broadphase benchmark, NVIDIA flavour.
+	-- The project is declared only when findOpenCL_NVIDIA() finds the NVIDIA OpenCL SDK.
+	hasCL = findOpenCL_NVIDIA()
+
+	if hasCL then
+		project("OpenCL_broadphase_benchmark_NVIDIA")
+
+		-- NOTE(review): the init* helpers are defined outside this script; the call
+		-- order is kept as it was because they may change the active configuration.
+		initOpenCL_NVIDIA()
+
+		language("C++")
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+		includedirs({
+			"../../../rendering/BulletMath",
+			"../../primitives",
+			"../../../../../src",
+		})
+
+		files({
+			-- benchmark sources
+			"../main.cpp",
+			"../findPairsOpenCL.cpp",
+			"../findPairsOpenCL.h",
+			"../btGridBroadphaseCL.cpp",
+			"../btGridBroadphaseCL.h",
+			-- shared 3D grid broadphase
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			-- Bullet sources compiled directly into the benchmark
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			-- OpenCL utilities and OpenGL interop
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h",
+		})
+	end
--- a/Show More
+++ b/Show More