Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/AMD/premake4.lua
@@ -0,0 +1,45 @@
+-- Premake4 project definition for the Bullet 2 basic demo using the AMD OpenCL SDK.
+-- The project is only emitted on Windows, and only when findOpenCL_AMD() succeeds.
+if os.is("Windows") then
+
+	-- hasCL is assigned without `local`, i.e. it is a global; kept that way on purpose
+	-- so any other script that reads it keeps working.
+	hasCL = findOpenCL_AMD()
+
+	if hasCL then
+
+		project "basic_bullet2_demo_AMD"
+
+		-- Helper provided elsewhere in the build scripts; invoked right after the
+		-- project is declared, before any other settings.
+		initOpenCL_AMD()
+
+		language "C++"
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		includedirs {
+			"..",
+			"../../../bullet2",
+			"../../testbed",
+			"../../../rendering/Gwen",
+			"../../../opencl/basic_initialize",
+			"../../../opencl/primitives"
+		}
+
+		links {
+			"testbed",
+			"bullet2",
+			"gwen"
+		}
+
+		initOpenGL()
+		initGlut()
+
+		-- Every source/header in the parent demo directory plus the OpenCL utility pair.
+		files {
+			"../**.cpp",
+			"../**.h",
+			"../../../opencl/basic_initialize/btOpenCLUtils.cpp",
+			"../../../opencl/basic_initialize/btOpenCLUtils.h"
+		}
+
+	end
+
+end
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.cpp
@@ -0,0 +1,538 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "BasicDemo.h"
+#include "GlutStuff.h"
+///btBulletDynamicsCommon.h is the main Bullet include file, contains most common include files.
+#include "btBulletDynamicsCommon.h"
+#include "CustomConvexShape.h"
+#include "CustomConvexPairCollision.h"
+#include "CustomCollisionDispatcher.h"
+
+#include "ConvexHeightFieldShape.h"
+#include "GLDebugDrawer.h"
+// File-scope debug drawer instance; initPhysics() hands its address to the dynamics world.
+static GLDebugDrawer sDebugDraw;
+
+#include <stdio.h> //printf debugging
+
+#ifdef CL_PLATFORM_AMD
+#include "../../opencl/basic_initialize/btOpenCLUtils.h"
+
+// OpenCL handles passed to CustomCollisionDispatcher in initPhysics().
+// NOTE(review): they are zero-initialized here and nothing in this file assigns them;
+// presumably they are set up elsewhere before initPhysics() runs -- confirm.
+cl_context			g_cxMainContext=0;
+cl_command_queue	g_cqCommandQue=0;
+cl_device_id		g_clDevice=0;
+#endif
+
+///create 144 (6x6x4) dynamic objects; initPhysics() uses X/Z for the footprint and Y for the number of stacked layers
+#define ARRAY_SIZE_X 6
+#define ARRAY_SIZE_Y 6
+#define ARRAY_SIZE_Z 4
+
+//maximum number of objects (and allow user to shoot additional boxes)
+//NOTE(review): not referenced anywhere in this file
+#define MAX_PROXIES (ARRAY_SIZE_X*ARRAY_SIZE_Y*ARRAY_SIZE_Z + 1024)
+
+///scaling of the objects (0.1 = 20 centimeter boxes )
+#define SCALING 1.
+///offset added to the origin of the object grid built in initPhysics()
+#define START_POS_X 0
+#define START_POS_Y -0.8
+#define START_POS_Z 0
+
+///number of xyz triples in BoxVtx and in BoxVtx2
+#define BoxVtxCount 8
+
+// Eight corners (x,y,z per row) of an axis-aligned cube spanning -0.5..0.5 on every axis.
+// Only referenced from a commented-out line in initPhysics().
+static float BoxVtx[] = {
+-0.5,-0.5,-0.5,
+-0.5,-0.5,0.5,
+-0.5,0.5,-0.5,
+-0.5,0.5,0.5,
+0.5,-0.5,-0.5,
+0.5,-0.5,0.5,
+0.5,0.5,-0.5,
+0.5,0.5,0.5,
+};
+
+// Eight corners (x,y,z per row) of a box with half extents 20.3 x 10.3 x 20.3.
+// initPhysics() builds the static ground shape from these vertices.
+static float BoxVtx2[] = {
+-20.3,-10.3,-20.3,
+-20.3,-10.3,20.3,
+-20.3,10.3,-20.3,
+-20.3,10.3,20.3,
+20.3,-10.3,-20.3,
+20.3,-10.3,20.3,
+20.3,10.3,-20.3,
+20.3,10.3,20.3,
+};
+
+
+// Number of rows in BarrelVtx2.
+#define BarrelVtxCount2 57
+
+// Barrel mesh vertex table: 57 rows of six floats each. initPhysics() passes it to the
+// shape constructor with a stride of 6*sizeof(float). The first three floats of a row
+// are the position; the last three look like a per-vertex normal -- TODO confirm how
+// the shape constructor interprets them.
+static float BarrelVtx2[] = {
+0.0f,-0.5f,0.0f,				0.0f,-1.0f,0.0f,
+0.282362f,-0.5f,-0.205148f,     0.0f,-1.0f,0.0f,
+0.349018f,-0.5f,0.0f,           0.0f,-1.0f,0.0f,
+0.107853f,-0.5f,-0.331936f,     0.0f,-1.0f,0.0f,
+-0.107853f,-0.5f,-0.331936f,    0.0f,-1.0f,0.0f,
+0.107853f,-0.5f,-0.331936f,     0.0f,-1.0f,0.0f,
+-0.282362f,-0.5f,-0.205148f,    0.0f,-1.0f,0.0f,
+-0.349018f,-0.5f,0.0f,          0.0f,-1.0f,0.0f,
+-0.282362f,-0.5f,0.205148f,     0.0f,-1.0f,0.0f,
+-0.107853f,-0.5f,0.331936f,     0.0f,-1.0f,0.0f,
+0.107853f,-0.5f,0.331936f,      0.0f,-1.0f,0.0f,
+0.282362f,-0.5f,0.205148f,      0.0f,-1.0f,0.0f,
+0.0f,0.5f,0.0f,                 0.0f,1.0f,0.0f,
+0.349018f,0.5f,0.0f,            0.0f,1.0f,0.0f,
+0.282362f,0.5f,-0.205148f,      0.0f,1.0f,0.0f,
+0.107853f,0.5f,-0.331936f,      0.0f,1.0f,0.0f,
+0.107853f,0.5f,-0.331936f,      0.0f,1.0f,0.0f,
+-0.107853f,0.5f,-0.331936f,     0.0f,1.0f,0.0f,
+-0.282362f,0.5f,-0.205148f,     0.0f,1.0f,0.0f,
+-0.349018f,0.5f,0.0f,           0.0f,1.0f,0.0f,
+-0.282362f,0.5f,0.205148f,      0.0f,1.0f,0.0f,
+-0.107853f,0.5f,0.331936f,      0.0f,1.0f,0.0f,
+0.107853f,0.5f,0.331936f,       0.0f,1.0f,0.0f,
+0.282362f,0.5f,0.205148f,       0.0f,1.0f,0.0f,
+0.349018f,-0.5f,0.0f,           0.957307f,-0.289072f,0.0f,
+0.404509f,0.0f,-0.293893f,      0.809017f,0.0f,-0.587785f,
+0.5f,0.0f,0.0f,                 1.0f,0.0f,0.0f,
+0.282362f,-0.5f,-0.205148f,     0.774478f,-0.289072f,-0.562691f,
+0.154508f,0.0f,-0.475528f,      0.309017f,0.0f,-0.951057f,
+0.107853f,-0.5f,-0.331936f,     0.295824f,-0.289072f,-0.910453f,
+0.107853f,-0.5f,-0.331936f,     0.295824f,-0.289072f,-0.910453f,
+-0.154509f,0.0f,-0.475528f,     -0.309017f,0.0f,-0.951057f,
+0.154508f,0.0f,-0.475528f,      0.309017f,0.0f,-0.951057f,
+-0.107853f,-0.5f,-0.331936f,    -0.295824f,-0.289072f,-0.910453f,
+-0.404509f,0.0f,-0.293893f,     -0.809017f,0.0f,-0.587785f,
+-0.282362f,-0.5f,-0.205148f,    -0.774478f,-0.289072f,-0.562691f,
+-0.5f,0.0f,0.0f,                -1.0f,0.0f,0.0f,
+-0.349018f,-0.5f,0.0f,          -0.957307f,-0.289072f,0.0f,
+-0.404508f,0.0f,0.293893f,      -0.809017f,0.0f,0.587785f,
+-0.282362f,-0.5f,0.205148f,     -0.774478f,-0.289072f,0.562691f,
+-0.154509f,0.0f,0.475528f,      -0.309017f,0.0f,0.951056f,
+-0.107853f,-0.5f,0.331936f,     -0.295824f,-0.289072f,0.910453f,
+0.154509f,0.0f,0.475528f,       0.309017f,0.0f,0.951056f,
+0.107853f,-0.5f,0.331936f,      0.295824f,-0.289072f,0.910453f,
+0.404509f,0.0f,0.293892f,       0.809017f,0.0f,0.587785f,
+0.282362f,-0.5f,0.205148f,      0.774478f,-0.289072f,0.562691f,
+0.282362f,0.5f,-0.205148f,      0.774478f,0.289072f,-0.562691f,
+0.349018f,0.5f,0.0f,            0.957307f,0.289072f,0.0f,
+0.107853f,0.5f,-0.331936f,      0.295824f,0.289072f,-0.910453f,
+-0.107853f,0.5f,-0.331936f,     -0.295824f,0.289072f,-0.910453f,
+0.107853f,0.5f,-0.331936f,      0.295824f,0.289072f,-0.910453f,
+-0.282362f,0.5f,-0.205148f,     -0.774478f,0.289072f,-0.562691f,
+-0.349018f,0.5f,0.0f,           -0.957307f,0.289072f,0.0f,
+-0.282362f,0.5f,0.205148f,      -0.774478f,0.289072f,0.562691f,
+-0.107853f,0.5f,0.331936f,      -0.295824f,0.289072f,0.910453f,
+0.107853f,0.5f,0.331936f,       0.295824f,0.289072f,0.910453f,
+0.282362f,0.5f,0.205148f,       0.774478f,0.289072f,0.562691f,
+};
+
+
+// Index table for the barrel mesh: 60 rows of three indices, each index in 0..56
+// (i.e. a valid row of BarrelVtx2). Presumably one triangle per row -- TODO confirm.
+// NOTE(review): not referenced anywhere in this file.
+static int BarrelIdx[] = {
+0,1,2,
+0,3,1,
+0,4,5,
+0,6,4,
+0,7,6,
+0,8,7,
+0,9,8,
+0,10,9,
+0,11,10,
+0,2,11,
+12,13,14,
+12,14,15,
+12,16,17,
+12,17,18,
+12,18,19,
+12,19,20,
+12,20,21,
+12,21,22,
+12,22,23,
+12,23,13,
+24,25,26,
+24,27,25,
+27,28,25,
+27,29,28,
+30,31,32,
+30,33,31,
+33,34,31,
+33,35,34,
+35,36,34,
+35,37,36,
+37,38,36,
+37,39,38,
+39,40,38,
+39,41,40,
+41,42,40,
+41,43,42,
+43,44,42,
+43,45,44,
+45,26,44,
+45,24,26,
+26,46,47,
+26,25,46,
+25,48,46,
+25,28,48,
+32,49,50,
+32,31,49,
+31,51,49,
+31,34,51,
+34,52,51,
+34,36,52,
+36,53,52,
+36,38,53,
+38,54,53,
+38,40,54,
+40,55,54,
+40,42,55,
+42,56,55,
+42,44,56,
+44,47,56,
+44,26,47,
+};
+
+
+/// Emits the x/y/z components of a float4 as an OpenGL vertex; the w component is not used.
+__inline void glVertexFloat4( const float4& v )
+{
+	const float px = v.x;
+	const float py = v.y;
+	const float pz = v.z;
+	glVertex3f( px, py, pz );
+}
+
+/// Draws nVtx points from vtx as 3-pixel yellow GL_POINTS, after multiplying the current
+/// matrix by a transform built from the transposed rotation matrix of quat and translation.
+/// The matrix stack is pushed on entry and popped on exit.
+__inline void drawPointListTransformed(const float4* vtx,  int nVtx, const float4& translation, const Quaternion& quat)
+{
+	glPushMatrix();
+
+	Matrix3x3 basis = mtTranspose( qtGetRotationMatrix( quat ) );
+
+	// 16 floats handed to glMultMatrixf: three rows of the transposed rotation, each
+	// padded with 0, followed by the translation and a trailing 1.
+	float m[16];
+	for(int r=0; r<3; r++)
+	{
+		m[4*r+0] = basis.m_row[r].x;
+		m[4*r+1] = basis.m_row[r].y;
+		m[4*r+2] = basis.m_row[r].z;
+		m[4*r+3] = 0;
+	}
+	m[12] = translation.x;
+	m[13] = translation.y;
+	m[14] = translation.z;
+	m[15] = 1;
+
+	glMultMatrixf( m );
+
+	float4 yellow = make_float4(1,1,0,0);
+
+	glPointSize(3.f);
+	glBegin(GL_POINTS);
+	const float4* const last = vtx + nVtx;
+	for(const float4* it = vtx; it < last; ++it)
+	{
+		// colour is re-issued for every point, alpha forced to 1
+		glColor4f(yellow.x, yellow.y, yellow.z, 1);
+		glVertexFloat4( *it );
+	}
+	glEnd();
+
+	glPopMatrix();
+}
+/// Thin public wrapper: forwards all arguments unchanged to drawPointListTransformed().
+void displaySamples(const float4* vertices, int numVertices, const float4& translation, const Quaternion& quaternion)
+{
+	drawPointListTransformed(
+		vertices,
+		numVertices,
+		translation,
+		quaternion );
+}
+
+
+
+/// Draws the height-field sample points of every CUSTOM_POLYHEDRAL_SHAPE_TYPE object in
+/// the world, transformed by the object's world transform.
+/// Does nothing unless a world and a debug drawer exist and the drawer has
+/// DBG_DrawContactPoints enabled.
+void BasicDemo::renderSurfacePoints()
+{
+	// displayCallback() may run before/after the world exists, so guard both pointers.
+	if (!m_dynamicsWorld)
+		return;
+
+	btIDebugDraw* debugDrawer = m_dynamicsWorld->getDebugDrawer();
+	if (!debugDrawer || !(debugDrawer->getDebugMode() & btIDebugDraw::DBG_DrawContactPoints))
+		return;
+
+	for (int i=0;i<m_dynamicsWorld->getCollisionObjectArray().size();i++)
+	{
+		btCollisionObject* ob = m_dynamicsWorld->getCollisionObjectArray()[i];
+		if (ob->getCollisionShape()->getShapeType() != CUSTOM_POLYHEDRAL_SHAPE_TYPE)
+			continue;
+
+		CustomConvexShape* customConvex = (CustomConvexShape*)ob->getCollisionShape();
+		ConvexHeightField* cvxShape= customConvex->m_ConvexHeightField;
+		if (!cvxShape)
+		{
+			// Nothing to draw for this object; skip it instead of dereferencing null below.
+			printf("aargh\n");
+			continue;
+		}
+
+		// Convert the Bullet transform into the float4/Quaternion pair displaySamples() takes.
+		const btVector3& pA = ob->getWorldTransform().getOrigin();
+		btQuaternion qA = ob->getWorldTransform().getRotation();
+
+		float4 bodyApos;
+		bodyApos.x = pA.getX();
+		bodyApos.y = pA.getY();
+		bodyApos.z = pA.getZ();
+		bodyApos.w = 0.f;
+
+		Quaternion bodyAquat;
+		bodyAquat.x = qA.getX();
+		bodyAquat.y = qA.getY();
+		bodyAquat.z = qA.getZ();
+		bodyAquat.w = qA.getW();
+
+		displaySamples(cvxShape->getSamplePoints(),cvxShape->getNumSamplePoints(),bodyApos,bodyAquat);
+	}
+}
+/// Per-frame update: clears the buffers, advances the simulation by the elapsed wall-clock
+/// time (when a world exists), then renders the scene and the surface sample points.
+void BasicDemo::clientMoveAndDisplay()
+{
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+	// elapsed time since the previous call, in microseconds
+	const float elapsedMicros = getDeltaTimeMicroseconds();
+
+	if (m_dynamicsWorld)
+	{
+		// stepSimulation expects seconds
+		const float elapsedSeconds = elapsedMicros / 1000000.f;
+		m_dynamicsWorld->stepSimulation(elapsedSeconds);
+
+		// optional but useful: debug drawing
+		m_dynamicsWorld->debugDrawWorld();
+	}
+
+	renderme();
+	renderSurfacePoints();
+
+	glFlush();
+	swapBuffers();
+}
+
+
+
+/// Redraw handler: renders the scene and surface points without stepping the simulation,
+/// then issues the world's debug drawing when a world exists.
+void BasicDemo::displayCallback(void)
+{
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+
+	renderme();
+	renderSurfacePoints();
+
+	// optional but useful: debug drawing to detect problems
+	if (m_dynamicsWorld)
+	{
+		m_dynamicsWorld->debugDrawWorld();
+	}
+
+	glFlush();
+	swapBuffers();
+}
+
+
+
+
+
+/// Builds the whole simulation: collision configuration, custom dispatcher, broadphase,
+/// solver and dynamics world, then adds one static ground body (built from BoxVtx2) and an
+/// ARRAY_SIZE_X x ARRAY_SIZE_Y x ARRAY_SIZE_Z grid of dynamic bodies that all share one
+/// barrel-shaped collision shape. Every object allocated here is owned by this class and
+/// released in exitPhysics(), except where noted below.
+void	BasicDemo::initPhysics()
+{
+	setTexturing(true);
+	setShadows(true);
+
+	m_acceleratedRigidBodies = 0;
+
+	setCameraDistance(btScalar(SCALING*20.));
+
+	///collision configuration contains default setup for memory, collision setup
+	m_collisionConfiguration = new btDefaultCollisionConfiguration();
+	//m_collisionConfiguration->setConvexConvexMultipointIterations();
+
+	// Create exactly one dispatcher. Allocating a default btCollisionDispatcher first and
+	// then overwriting m_dispatcher would leak that first object on every call.
+#ifdef CL_PLATFORM_AMD
+	m_dispatcher = new	CustomCollisionDispatcher(m_collisionConfiguration,	g_cxMainContext,g_clDevice,g_cqCommandQue);
+#else
+	m_dispatcher = new	CustomCollisionDispatcher(m_collisionConfiguration);
+#endif
+
+	// Route custom-vs-custom shape pairs to the custom convex pair algorithm.
+	// NOTE(review): this CreateFunc is allocated with new and exitPhysics() does not
+	// delete it -- confirm whether the dispatcher takes ownership.
+	m_dispatcher->registerCollisionCreateFunc(CUSTOM_POLYHEDRAL_SHAPE_TYPE,CUSTOM_POLYHEDRAL_SHAPE_TYPE,new CustomConvexConvexPairCollision::CreateFunc(m_collisionConfiguration->getSimplexSolver(), m_collisionConfiguration->getPenetrationDepthSolver()));
+
+	m_broadphase = new btDbvtBroadphase();
+
+	///the default constraint solver. For parallel processing you can use a different solver (see Extras/BulletMultiThreaded)
+	btSequentialImpulseConstraintSolver* sol = new btSequentialImpulseConstraintSolver;
+	m_solver = sol;
+
+	m_dynamicsWorld = new btDiscreteDynamicsWorld(m_dispatcher,m_broadphase,m_solver,m_collisionConfiguration);
+
+	m_dynamicsWorld->setGravity(btVector3(0,-10,0));
+
+	m_dynamicsWorld->setDebugDrawer(&sDebugDraw);
+
+	///create a few basic rigid bodies
+	//btCollisionShape* groundShape = new btBoxShape(btVector3(btScalar(50.),btScalar(50.),btScalar(50.)));
+#if 1
+	CustomConvexShape* groundShape = new CustomConvexShape(BoxVtx2,BoxVtxCount,3*sizeof(float));
+	//btCollisionShape* groundShape = new btStaticPlaneShape(btVector3(0,1,0),0);
+
+	m_collisionShapes.push_back(groundShape);
+
+	btTransform groundTransform;
+	groundTransform.setIdentity();
+	groundTransform.setOrigin(btVector3(0,-11,0));
+
+	//We can also use DemoApplication::localCreateRigidBody, but for clarity it is provided here:
+	{
+		btScalar mass(0.);
+
+		//rigidbody is dynamic if and only if mass is non zero, otherwise static
+		bool isDynamic = (mass != 0.f);
+
+		btVector3 localInertia(0,0,0);
+		if (isDynamic)
+			groundShape->calculateLocalInertia(mass,localInertia);
+
+		//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+		btDefaultMotionState* myMotionState = new btDefaultMotionState(groundTransform);
+		btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,groundShape,localInertia);
+		btRigidBody* body = new btRigidBody(rbInfo);
+
+		//add the body to the dynamics world
+		m_dynamicsWorld->addRigidBody(body);
+	}
+#endif
+
+
+	{
+		//create a few dynamic rigidbodies
+		// Re-using the same collision shape is better for memory usage and performance
+
+		//btCollisionShape* colShape = new btBoxShape(btVector3(SCALING*1,SCALING*1,SCALING*1));
+		//btCollisionShape* colShape = new btSphereShape(btScalar(1.));
+#define USE_CUSTOM_HEIGHTFIELD_SHAPE 
+#ifdef USE_CUSTOM_HEIGHTFIELD_SHAPE
+		CustomConvexShape* colShape = new CustomConvexShape(BarrelVtx2,BarrelVtxCount2,6*sizeof(float));
+
+		//CustomConvexShape* colShape = new CustomConvexShape(BoxVtx,BoxVtxCount,3*sizeof(float));
+#else
+		btConvexHullShape* colShape = new btConvexHullShape(BarrelVtx2,BarrelVtxCount2,6*sizeof(float));
+		colShape->setLocalScaling(btVector3(0.9,0.9,0.9));
+
+#endif //USE_CUSTOM_HEIGHTFIELD_SHAPE
+		// half of the spacing between neighbouring bodies in the grid
+		btScalar scale = 0.5f;
+
+		//btScalar scale = 1.f;
+
+		//next line is already called inside the CustomConvexShape constructor
+		//colShape->initializePolyhedralFeatures();
+
+		m_collisionShapes.push_back(colShape);
+
+		/// Create Dynamic Objects
+		btTransform startTransform;
+		startTransform.setIdentity();
+
+		btScalar	mass(1.f);
+
+		//rigidbody is dynamic if and only if mass is non zero, otherwise static
+		bool isDynamic = (mass != 0.f);
+
+		btVector3 localInertia(0,0,0);
+		if (isDynamic)
+			colShape->calculateLocalInertia(mass,localInertia);
+
+		// lower corner of the grid; X and Z are shifted by half the grid size
+		float start_x = START_POS_X - ARRAY_SIZE_X/2;
+		float start_y = START_POS_Y;
+		float start_z = START_POS_Z - ARRAY_SIZE_Z/2;
+
+		// k walks the vertical layers, j the Z rows, i the X columns
+		for (int k=0;k<ARRAY_SIZE_Y;k++)
+		{
+			for(int j = 0;j<ARRAY_SIZE_Z;j++)	
+			{
+				for (int i=0;i<ARRAY_SIZE_X;i++)
+				{
+					startTransform.setOrigin(SCALING*btVector3(
+										btScalar(scale*2.0*i + start_x),
+										btScalar(scale*1+scale*2.0*k + start_y),
+										btScalar(scale*2.0*j + start_z)));
+
+					//using motionstate is recommended, it provides interpolation capabilities, and only synchronizes 'active' objects
+					btDefaultMotionState* myMotionState = new btDefaultMotionState(startTransform);
+					btRigidBody::btRigidBodyConstructionInfo rbInfo(mass,myMotionState,colShape,localInertia);
+					btRigidBody* body = new btRigidBody(rbInfo);
+
+					//m_acceleratedRigidBodies is used as a mapping to the accelerated rigid body index
+					body->setCompanionId(m_acceleratedRigidBodies++);
+					m_dynamicsWorld->addRigidBody(body);
+				}
+			}
+		}
+	}
+
+
+}
+/// Resets the demo by tearing the whole simulation down and building it again from scratch.
+void	BasicDemo::clientResetScene()
+{
+	// order matters: release everything first, then recreate
+	exitPhysics();
+
+	initPhysics();
+}
+	
+
+/// Releases everything initPhysics() created, in reverse order of creation.
+/// Safe to call when no world exists and safe to call more than once: every owned pointer
+/// is reset to 0 after it is deleted, so a later call (for example from the destructor)
+/// does not delete it a second time.
+void	BasicDemo::exitPhysics()
+{
+
+	//cleanup in the reverse order of creation/initialization
+
+	//remove the rigidbodies from the dynamics world and delete them
+	if (m_dynamicsWorld)
+	{
+		for (int i=m_dynamicsWorld->getNumCollisionObjects()-1; i>=0 ;i--)
+		{
+			btCollisionObject* obj = m_dynamicsWorld->getCollisionObjectArray()[i];
+			btRigidBody* body = btRigidBody::upcast(obj);
+			if (body && body->getMotionState())
+			{
+				delete body->getMotionState();
+			}
+			m_dynamicsWorld->removeCollisionObject( obj );
+			delete obj;
+		}
+	}
+
+	//delete collision shapes
+	for (int j=0;j<m_collisionShapes.size();j++)
+	{
+		btCollisionShape* shape = m_collisionShapes[j];
+		delete shape;
+	}
+	m_collisionShapes.clear();
+
+	delete m_dynamicsWorld;
+	m_dynamicsWorld = 0;
+
+	delete m_solver;
+	m_solver = 0;
+
+	delete m_broadphase;
+	m_broadphase = 0;
+
+	delete m_dispatcher;
+	m_dispatcher = 0;
+
+	delete m_collisionConfiguration;
+	m_collisionConfiguration = 0;
+
+}
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/BasicDemo.h
@@ -0,0 +1,86 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef BASIC_DEMO_H
+#define BASIC_DEMO_H
+
+#ifdef _WINDOWS
+#include "Win32DemoApplication.h"
+#define PlatformDemoApplication Win32DemoApplication
+#else
+#include "GlutDemoApplication.h"
+#define PlatformDemoApplication GlutDemoApplication
+#endif
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+class btBroadphaseInterface;
+class btCollisionShape;
+class btOverlappingPairCache;
+class btCollisionDispatcher;
+class btConstraintSolver;
+struct btCollisionAlgorithmCreateFunc;
+class btDefaultCollisionConfiguration;
+
+///BasicDemo is good starting point for learning the code base and porting.
+
+/// Demo application that owns a Bullet dynamics world populated by initPhysics() and torn
+/// down by exitPhysics(). The destructor calls exitPhysics(), so all owned pointers start
+/// out as 0 to keep destruction well-defined even if initPhysics() was never called.
+class BasicDemo : public PlatformDemoApplication
+{
+
+	//keep the collision shapes, for deletion/cleanup
+	btAlignedObjectArray<btCollisionShape*>	m_collisionShapes;
+
+	// The four objects below are created in initPhysics() and deleted in exitPhysics().
+	btBroadphaseInterface*	m_broadphase;
+
+	btCollisionDispatcher*	m_dispatcher;
+
+	btConstraintSolver*	m_solver;
+
+	btDefaultCollisionConfiguration* m_collisionConfiguration;
+
+	// Running counter handed out as companion id to each dynamic body in initPhysics().
+	int m_acceleratedRigidBodies;
+
+	public:
+
+	BasicDemo()
+		: m_broadphase(0),
+		m_dispatcher(0),
+		m_solver(0),
+		m_collisionConfiguration(0),
+		m_acceleratedRigidBodies(0)
+	{
+	}
+	virtual ~BasicDemo()
+	{
+		exitPhysics();
+	}
+	/// Creates the world and fills it with the demo scene.
+	void	initPhysics();
+
+	/// Destroys the world and everything initPhysics() allocated.
+	void	exitPhysics();
+
+	/// Steps the simulation and renders one frame.
+	virtual void clientMoveAndDisplay();
+
+	/// Renders one frame without stepping the simulation.
+	virtual void displayCallback();
+	/// Rebuilds the scene via exitPhysics() followed by initPhysics().
+	virtual void	clientResetScene();
+	
+	/// Factory: allocates a demo, runs myinit() and initPhysics() on it, and returns it.
+	/// The caller owns the returned object.
+	static DemoApplication* Create()
+	{
+		BasicDemo* demo = new BasicDemo;
+		demo->myinit();
+		demo->initPhysics();
+		return demo;
+	}
+
+	/// Draws the sample points of custom convex shapes when contact-point debug drawing is on.
+	void renderSurfacePoints();
+
+	
+};
+
+#endif //BASIC_DEMO_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.cpp
@@ -0,0 +1,507 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#include "ConvexHeightFieldShape.h"
+#include "Stubs/AdlCollideUtils.h"
+#include "CubeMapUtils.h"
+//#include <common/Physics/ShapeBase.h>
+//#include <common/Physics/SphereShape.h>
+//#include "GlutStuff.h"
+
+//#define USE_OLD
+
+/// Builds the height field from an indexed triangle mesh; all work is delegated to create().
+ConvexHeightField::ConvexHeightField(const float4* vtxBuffer, const int4* idxBuffer, int nTriangles)
+	: CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD )
+{
+	create( vtxBuffer, idxBuffer, nTriangles );
+}
+
+/// Fills the cube-map height field from an indexed triangle mesh.
+///
+/// Steps, in order:
+///  1. m_scale = distance from the origin to the farthest referenced vertex.
+///  2. For every cube-map texel a ray of length m_scale is cast from the origin against
+///     all triangles; the closest positive hit gives the quantized height, the face normal
+///     and a value derived from the barycentric coordinates (stored in the normal's w).
+///  3. calcSamplePoints() converts the quantized heights to points in m_samplePoints.
+///  4. A quantized support height is computed per texel from those points.
+///  5. The overall AABB and the six per-face AABBs are computed and grown by the margin.
+///
+/// @param vtxBuffer   vertex positions
+/// @param idxBuffer   triangle indices; components x/y/z (s[0..2]) of each entry are used
+/// @param nTriangles  number of entries in idxBuffer
+void ConvexHeightField::create( const float4* vtxBuffer, const int4* idxBuffer, int nTriangles )
+{
+	//	scale = largest vertex distance from the origin
+	{
+		float maxDx2 = -1.f;
+		int maxIdx = -1;
+		for(int i=0; i<nTriangles; i++)
+		{
+			const int4& idx = idxBuffer[i];
+			for(int j=0; j<3; j++)
+			{
+				float dx2 = dot3F4( vtxBuffer[idx.s[j]], vtxBuffer[idx.s[j]] );
+				if( dx2 > maxDx2 )
+				{
+					maxDx2 = dx2;
+					maxIdx = idx.s[j];
+				}
+			}
+		}
+		ADLASSERT( maxIdx != -1 );
+		m_scale = sqrtf( maxDx2 );
+	}
+
+	//	cast rays to find the intersections with the mesh
+	{
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				float4 v;
+				float x = (i+0.5f)/(float)HEIGHT_RES;
+				float y = (j+0.5f)/(float)HEIGHT_RES;
+				v = CubeMapUtils::calcVector(faceIdx, x, y);
+				v = normalize3( v );
+				v *= m_scale;
+
+				float minFraction = FLT_MAX;
+				//	Deterministic fallbacks for the case that no triangle is hit: the
+				//	ray direction as normal and zero barycentric coordinates, instead of
+				//	reading uninitialized memory below.
+				float4 minNormal = v;
+				float4 minBCrd = make_float4(0.f);
+				for(int itri=0; itri<nTriangles; itri++)
+				{
+					const float4& a = vtxBuffer[idxBuffer[itri].x];
+					const float4& b = vtxBuffer[idxBuffer[itri].y];
+					const float4& c = vtxBuffer[idxBuffer[itri].z];
+
+					float4 from = make_float4(0.f);
+					float4 bCrd;
+					float fraction = CollideUtils::castRay( a, b, c, from, v, 0.0f, &bCrd );
+
+					//	Keep normal and barycentric data of the CLOSEST hit only, so they
+					//	always belong to the same triangle as the stored height.
+					if( fraction > 0.f && fraction < minFraction )
+					{
+						minFraction = fraction;
+						float4 ab = b-a;
+						float4 ac = c-a;
+						minNormal = cross3( ab, ac );
+						minBCrd = bCrd;
+					}
+				}
+
+				if( minFraction == FLT_MAX )
+					minFraction = 0.f;
+
+				{
+					u8 quantizedHeight = (u8)(minFraction*255.f);
+					sample( (Face)faceIdx, i,j ) = quantizedHeight;
+					sampleNormal( (Face)faceIdx, i,j ) = normalize3(minNormal);
+					//	1/3 is |bCrd|^2 at the triangle centroid; w is rescaled so that
+					//	the centroid maps to 0 and a value of 1 maps to 1.
+					float minValue = 3.f*(1.f/3.f)*(1.f/3.f);
+					sampleNormal( (Face)faceIdx, i,j ).w = (dot3F4( minBCrd, minBCrd ) - minValue )/(1.f-minValue);
+				}
+			}
+		}
+	}
+
+	calcSamplePoints( m_samplePoints );
+
+	//	calc support height using m_samplePoints
+	{
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++) for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+		{
+			float4 v;
+			float x = (i+0.5f)/(float)HEIGHT_RES;
+			float y = (j+0.5f)/(float)HEIGHT_RES;
+			v = CubeMapUtils::calcVector(faceIdx, x, y);
+			v = normalize3( v );
+
+			//	largest projection of any sample point onto this texel's direction
+			float maxHeight = -1;
+			for(int ie=0; ie<6*HEIGHT_RES*HEIGHT_RES; ie++)
+			{
+				float h = dot3F4( v, m_samplePoints[ie] )/m_scale;
+				ADLASSERT( h <= 1.f );
+				if( h > maxHeight ) maxHeight = h;
+			}
+
+			{
+				//	round up by one step, saturating at 255
+				u8 quantizedHeight = min2((u8)(maxHeight*255.f)+1, 255);
+				sampleSupport( (Face)faceIdx, i, j ) = quantizedHeight;
+			}
+		}
+	}
+
+	//	bounding box of all referenced vertices, grown by the collision margin
+	m_aabb.setEmpty();
+	for(int i=0; i<nTriangles; i++)
+	{
+		const int4& idx = idxBuffer[i];
+		m_aabb.includePoint( vtxBuffer[idx.x] );
+		m_aabb.includePoint( vtxBuffer[idx.y] );
+		m_aabb.includePoint( vtxBuffer[idx.z] );
+	}
+	m_aabb.expandBy( make_float4( m_collisionMargin ) );
+
+	//	one bounding box per cube face, built from that face's sample points
+	for(int i=0; i<6; i++)
+	{
+		m_faceAabbs[i].setEmpty();
+		for(int j=0; j<HEIGHT_RES*HEIGHT_RES; j++)
+		{
+			float4 p = m_samplePoints[i*HEIGHT_RES*HEIGHT_RES + j];
+			m_faceAabbs[i].includePoint(p);
+		}
+		m_faceAabbs[i].expandBy( make_float4( m_collisionMargin ) );
+	}
+}
+
+/// Returns the parameter t at which the line orig + t*vec meets the plane planeEqn
+/// (xyz = normal, w = offset). No check is made for a zero denominator.
+static __inline float localIntersectPlaneLine( const float4& planeEqn, const float4& vec, const float4& orig )
+{
+	const float numerator = -planeEqn.w - dot3F4(planeEqn, orig);
+	const float denominator = dot3F4(planeEqn, vec);
+	return numerator/denominator;
+}
+
+
+/// Builds the height field from a set of plane equations (xyz = normal, w = offset).
+///
+/// For every cube-map texel the closest positive intersection of the texel direction with
+/// any plane is found; m_scale becomes the largest such distance and the heights are
+/// quantized relative to it. Sample points, support heights and bounding boxes follow.
+///
+/// @param eqn   array of plane equations
+/// @param nEqn  number of entries in eqn
+ConvexHeightField::ConvexHeightField(const float4* eqn, int nEqn)
+	: CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD )
+{
+	//	Pass 1: per texel, closest plane hit along the texel direction.
+	//	The hit distance is parked in the w component of the stored normal.
+	for(u32 face=0; face<6; face++)
+	{
+		for(int row=0; row<HEIGHT_RES; row++)
+		{
+			for(int col=0; col<HEIGHT_RES; col++)
+			{
+				const float u = (row+0.5f)/(float)HEIGHT_RES;
+				const float w = (col+0.5f)/(float)HEIGHT_RES;
+				float4 dir = CubeMapUtils::calcVector(face, u, w);
+				dir = normalize3( dir );
+
+				float nearest = FLT_MAX;
+				float4 nearestPlane;
+				for(int k=0; k<nEqn; k++)
+				{
+					const float4& plane = eqn[k];
+					const float t = localIntersectPlaneLine( plane, dir, make_float4(0.f) );
+
+					if( t > 0.f && t < nearest )
+					{
+						nearest = t;
+						nearestPlane = plane;
+					}
+				}
+
+				ADLASSERT( nearest != FLT_MAX );
+
+				nearestPlane.w = nearest;
+				sampleNormal( (Face)face, row, col ) = nearestPlane;
+			}
+		}
+	}
+
+	//	Pass 2: the scale is the largest hit distance over all texels.
+	m_scale = -FLT_MAX;
+	for(u32 face=0; face<6; face++)
+	{
+		for(int row=0; row<HEIGHT_RES; row++)
+		{
+			for(int col=0; col<HEIGHT_RES; col++)
+			{
+				float4& stored = sampleNormal( (Face)face, row, col );
+				m_scale = max2( m_scale, stored.w );
+			}
+		}
+	}
+
+	//	Pass 3: quantize every hit distance relative to the scale.
+	for(u32 face=0; face<6; face++)
+	{
+		for(int row=0; row<HEIGHT_RES; row++)
+		{
+			for(int col=0; col<HEIGHT_RES; col++)
+			{
+				float4& stored = sampleNormal( (Face)face, row, col );
+				u8 q = (u8)(stored.w/m_scale*255.f);
+				sample( (Face)face, row, col ) = q;
+			}
+		}
+	}
+
+	calcSamplePoints( m_samplePoints );
+
+	//	Pass 4: support height per texel = largest projection of any sample point onto
+	//	the texel direction, clamped to 1 and rounded up by one quantization step.
+	for(u32 face=0; face<6; face++)
+	{
+		for(int row=0; row<HEIGHT_RES; row++)
+		{
+			for(int col=0; col<HEIGHT_RES; col++)
+			{
+				const float u = (row+0.5f)/(float)HEIGHT_RES;
+				const float w = (col+0.5f)/(float)HEIGHT_RES;
+				float4 dir = CubeMapUtils::calcVector(face, u, w);
+				dir = normalize3( dir );
+
+				float best = -1;
+				for(int s=0; s<6*HEIGHT_RES*HEIGHT_RES; s++)
+				{
+					float h = dot3F4( dir, m_samplePoints[s] )/m_scale;
+					h = (h>1.f) ? 1.f : h;
+					if( h > best ) best = h;
+				}
+
+				u8 q = min2((u8)(best*255.f)+1, 255);
+				sampleSupport( (Face)face, row, col ) = q;
+			}
+		}
+	}
+
+	//	Per-face bounding boxes from that face's sample points, grown by the margin.
+	for(int face=0; face<6; face++)
+	{
+		m_faceAabbs[face].setEmpty();
+		for(int s=0; s<HEIGHT_RES*HEIGHT_RES; s++)
+		{
+			float4 pt = m_samplePoints[face*HEIGHT_RES*HEIGHT_RES + s];
+			m_faceAabbs[face].includePoint(pt);
+		}
+		m_faceAabbs[face].expandBy( make_float4( m_collisionMargin ) );
+	}
+
+	//	The overall box is the union of the six face boxes.
+	m_aabb.setEmpty();
+	for(int face=0; face<6; face++)
+	{
+		m_aabb.includeVolume( m_faceAabbs[face] );
+	}
+}
+
+// Disabled (compiled out by #if 0): construction from a ShapeBase. A sphere gets a
+// constant full-scale height field with radial normals; any other shape is forwarded to
+// create() with its vertex/triangle buffers. Depends on the ShapeBase/SphereShape headers
+// whose includes are commented out at the top of this file.
+#if 0
+ConvexHeightField::ConvexHeightField(const ShapeBase* shape)
+	: CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD )
+{
+	if( shape->m_type == ADL_SHAPE_SPHERE )
+	{
+		SphereShape* sphere = (SphereShape*)shape;
+
+		m_scale = sphere->m_radius;
+		for(u32 faceIdx=0; faceIdx<6; faceIdx++)
+		{
+			for(int i=0; i<HEIGHT_RES; i++) for(int j=0; j<HEIGHT_RES; j++)
+			{
+				float4 minNormal;
+				float x = (i+0.5f)/(float)HEIGHT_RES;
+				float y = (j+0.5f)/(float)HEIGHT_RES;
+				minNormal = CubeMapUtils::calcVector(faceIdx, x, y);
+				minNormal = normalize3( minNormal );
+				{
+					u8 quantizedHeight = (u8)(1.f*255.f);
+					sample( (Face)faceIdx, i,j ) = quantizedHeight;
+					sampleNormal( (Face)faceIdx, i,j ) = normalize3(minNormal);
+//					float minValue = 3.f*(1.f/3.f)*(1.f/3.f);
+//					sampleNormal( (Face)faceIdx, i,j ).w = (dot3F4( minBCrd, minBCrd ) - minValue )/(1.f-minValue);
+				}
+			}
+		}
+
+		calcSamplePoints( m_samplePoints );
+
+		m_aabb.m_max = make_float4( sphere->m_radius );
+		m_aabb.m_min = make_float4( -sphere->m_radius );
+
+		m_aabb.expandBy( make_float4( m_collisionMargin ) );
+
+		for(int i=0; i<6; i++)
+		{
+			m_faceAabbs[i].setEmpty();
+			for(int j=0; j<HEIGHT_RES*HEIGHT_RES; j++)
+			{
+				float4 p = m_samplePoints[i*HEIGHT_RES*HEIGHT_RES + j];
+				m_faceAabbs[i].includePoint(p);
+			}
+			m_faceAabbs[i].expandBy( make_float4( m_collisionMargin ) );
+		}
+	}
+	else
+	{
+		ShapeBase* s = (ShapeBase*)shape;
+
+		create( s->getVertexBuffer(), s->getTriangleBuffer(), s->getNumTris() );
+	}
+}
+#endif
+
+/// Nothing to release here: the destructor body is intentionally empty.
+ConvexHeightField::~ConvexHeightField()
+{
+}
+
+/// Returns the signed radial distance of p from the height-field surface: |p| minus the
+/// bilinearly interpolated surface height in the direction of p.
+/// Returns FLT_MAX when p lies on or outside the sphere of radius m_scale.
+///
+/// @param p  query point in the shape's local frame
+float ConvexHeightField::queryDistance(const float4& p ) const
+{
+	if( dot3F4( p, p ) >= m_scale*m_scale ) return FLT_MAX;
+
+	int faceIdx;
+	float x, y;
+	CubeMapUtils::calcCrd( p, faceIdx, x, y );
+	//	move to texel-centre coordinates: texel k is centred at integer k
+	x = (x*HEIGHT_RES) - 0.5f;
+	y = (y*HEIGHT_RES) - 0.5f;
+
+	const int xi = (int)(x);
+	const int yi = (int)(y);
+
+	//	Between the face border and the first texel centre x (or y) is negative and
+	//	truncates to texel 0, which would give a negative weight and extrapolate past
+	//	the first texel. Clamp the weight to 0 so the low border behaves like the high
+	//	border, where the neighbour index is clamped to the last texel.
+	const float dx = max2( 0.f, x-xi );
+	const float dy = max2( 0.f, y-yi );
+
+	const int xip = min2((int)(HEIGHT_RES-1), xi+1);
+	const int yip = min2((int)(HEIGHT_RES-1), yi+1);
+
+	u8 xy = sample( (Face)faceIdx, xi, yi );
+	u8 xpy = sample( (Face)faceIdx, xip, yi );
+	u8 xpyp = sample( (Face)faceIdx, xip, yip );
+	u8 xyp = sample( (Face)faceIdx, xi, yip );
+
+	//	bilinear blend of the four quantized heights, then back to world units
+	float height = (xy*(1.f-dx)+xpy*dx)*(1.f-dy) + (xyp*(1.f-dx)+xpyp*dx)*dy;
+	height = height/255.f*m_scale;
+
+	return length3( p ) - height;
+}
+
+/// Returns a support height for the direction of p: the largest of the four quantized
+/// support samples surrounding p's cube-map coordinate, converted back via m_scale.
+///
+/// @param p  direction/point in the shape's local frame
+float ConvexHeightField::querySupportHeight(const float4& p ) const
+{
+//	if( dot3F4( p, p ) >= m_scale*m_scale ) return FLT_MAX;
+
+	int faceIdx;
+	float x, y;
+	CubeMapUtils::calcCrd( p, faceIdx, x, y );
+	x = (x*HEIGHT_RES) - 0.5f;
+	y = (y*HEIGHT_RES) - 0.5f;
+
+	const int xi = (int)(x);
+	const int yi = (int)(y);
+	//	neighbour texel, clamped to the last texel of the face
+	const int xip = min2((int)(HEIGHT_RES-1), xi+1);
+	const int yip = min2((int)(HEIGHT_RES-1), yi+1);
+
+	u8 xy = sampleSupport( (Face)faceIdx, xi, yi );
+	u8 xpy = sampleSupport( (Face)faceIdx, xip, yi );
+	u8 xpyp = sampleSupport( (Face)faceIdx, xip, yip );
+	u8 xyp = sampleSupport( (Face)faceIdx, xi, yip );
+
+	//	no interpolation here: the maximum of the four samples is used
+	float height = max2( xy, max2( xpy, max2( xpyp, xyp ) ) );
+	height = height/255.f*m_scale;
+
+	return height;
+}
+
+/// Returns the w component stored with the normal of the texel that p maps to, or 0 when
+/// p lies on or outside the sphere of radius m_scale.
+///
+/// @param p  query point in the shape's local frame
+float ConvexHeightField::queryW(const float4& p ) const
+{
+	if( dot3F4( p, p ) >= m_scale*m_scale ) return 0;
+
+	int faceIdx;
+	float x, y;
+	CubeMapUtils::calcCrd( p, faceIdx, x, y );
+	x = (x*HEIGHT_RES) - 0.5f;
+	y = (y*HEIGHT_RES) - 0.5f;
+
+	//	truncate to a texel index; no interpolation
+	const int xi = (int)(x);
+	const int yi = (int)(y);
+
+	return sampleNormal( (Face)faceIdx, xi, yi ).w;
+}
+
+/// Writes the normal stored for the texel that p maps to into normalOut and returns true.
+/// No distance is computed here; normalOut.w carries whatever is stored with the normal.
+///
+/// @param p          query point in the shape's local frame
+/// @param normalOut  receives the stored texel normal
+/// @return always true
+bool ConvexHeightField::queryDistanceWithNormal( const float4& p, float4& normalOut ) const
+{
+	int face;
+	float u, v;
+	CubeMapUtils::calcCrd( p, face, u, v );
+
+	//	shift to texel-centre coordinates, then truncate to a texel index
+	u = (u*HEIGHT_RES) - 0.5f;
+	v = (v*HEIGHT_RES) - 0.5f;
+	const int col = (int)(u);
+	const int row = (int)(v);
+
+	normalOut = sampleNormal( (Face)face, col, row );
+	return true;
+}
+
+/// Converts every quantized height into a point: the normalized texel direction scaled by
+/// the de-quantized height. Texel (i,j) of face f is written to
+/// points[ HEIGHT_RES*HEIGHT_RES*f + i + j*HEIGHT_RES ].
+///
+/// @param points  output array with room for 6*HEIGHT_RES*HEIGHT_RES entries
+void ConvexHeightField::calcSamplePoints(float4* points) const
+{
+	for(u32 face=0; face<6; face++)
+	{
+		for(int i=0; i<HEIGHT_RES; i++)
+		{
+			for(int j=0; j<HEIGHT_RES; j++)
+			{
+				const float u = (i+0.5f)/(float)HEIGHT_RES;
+				const float w = (j+0.5f)/(float)HEIGHT_RES;
+
+				float4 dir = CubeMapUtils::calcVector(face, u, w);
+				dir = normalize3( dir );
+
+				//	undo the 8-bit quantization
+				const int q = sample( (Face)face, i, j );
+				const float radius = q/255.f*m_scale;
+
+				points[ HEIGHT_RES*HEIGHT_RES*face + i + j*HEIGHT_RES ] = radius*dir;
+			}
+		}
+	}
+}
+
+/// Computes a single sample point from its flat index without reading m_samplePoints.
+/// The face is sIdx / (HEIGHT_RES*HEIGHT_RES); within the face the index is decoded as
+/// i = idx / HEIGHT_RES, j = idx % HEIGHT_RES.
+/// NOTE(review): calcSamplePoints() stores texel (i,j) at i + j*HEIGHT_RES, i.e. with i and
+/// j in the opposite roles, and derives the direction through CubeMapUtils::calcVector
+/// rather than the axis construction used here -- confirm both agree for the same index.
+///
+/// @param sIdx  flat sample index
+/// @return the sample point in the shape's local frame
+float4 ConvexHeightField::calcSamplePoint( int sIdx ) const
+{
+	const int texelsPerFace = HEIGHT_RES*HEIGHT_RES;
+
+	Face faceIdx = (Face)(sIdx/texelsPerFace);
+	const int axis = (faceIdx/2);		//	0 = x, 1 = y, 2 = z
+	const int positive = faceIdx & 1;	//	0 = minus face, 1 = plus face
+
+	//	unit vector along the face's axis
+	float ax = 0.f, ay = 0.f, az = 0.f;
+	switch( axis )
+	{
+	case 0: ax = 1.f; break;
+	case 1: ay = 1.f; break;
+	case 2: az = 1.f; break;
+	default: break;
+	}
+	float4 viewVector = make_float4( ax, ay, az );
+	if( positive==0 ) viewVector *= -1.f;
+
+	//	the two in-face axes are cyclic permutations of the view axis
+	float4 xVector = make_float4( viewVector.z, viewVector.x, viewVector.y );
+	float4 yVector = make_float4( viewVector.y, viewVector.z, viewVector.x );
+	float4 corner = viewVector-xVector-yVector;
+
+	const int inFace = sIdx%texelsPerFace;
+	const int i = inFace/HEIGHT_RES;
+	const int j = inFace%HEIGHT_RES;
+
+	float4 dir = corner + (i+0.5f)*xVector/(HEIGHT_RES*0.5f) + (j+0.5f)*yVector/(HEIGHT_RES*0.5f);
+	dir = normalize3( dir );
+
+	const int q = sample( faceIdx, i, j );
+	const float radius = q/255.f*m_scale;
+	return radius*dir;
+}
+
+/// Returns a pointer to the internal sample-point array; getNumSamplePoints() gives its length.
+const float4* ConvexHeightField::getSamplePoints() const
+{
+	const float4* points = m_samplePoints;
+	return points;
+}
+
+/// Returns the number of sample points: HEIGHT_RES*HEIGHT_RES texels on each of 6 faces.
+int ConvexHeightField::getNumSamplePoints() const
+{
+	const int count = HEIGHT_RES*HEIGHT_RES*6;
+	return count;
+}
+
+/// Maps a scalar s to a colour: each channel is an inverted parabola 1 - (4*(s-c))^2 with
+/// its peak at c = 0.75 (red), 0.5 (green) and 0.25 (blue). Values are not clamped.
+__inline
+float4 rainbowMap( float s )
+{
+	const float gain = 4.f;
+
+	const float r = gain*(s-0.75f);
+	const float g = gain*(s-0.5f);
+	const float b = gain*(s-0.25f);
+
+	return make_float4( 1.f-r*r, 1.f-g*g, 1.f-b*b );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/ConvexHeightFieldShape.h
@@ -0,0 +1,143 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifndef CONVEX_HEIGHT_FIELD_SHAPE_H
+#define CONVEX_HEIGHT_FIELD_SHAPE_H
+
+#include "Stubs/AdlQuaternion.h"
+#include "Stubs/AdlCollisionShape.h"
+#include "Stubs/AdlAabb.h"
+
+class ShapeBase;
+
+//	Collision shape stored as six HEIGHT_RES x HEIGHT_RES grids (one per cube-map face)
+//	of 8-bit quantized heights, plus per-texel normals, support heights and sample points.
+//	A quantized height h corresponds to h/255*m_scale (see calcSamplePoints()).
+class ConvexHeightField : public CollisionShape
+{
+	public:
+		enum
+		{
+			//	texels along each side of a face
+			HEIGHT_RES = 4, //was 4 originally
+		};
+
+		//	Face order is (axis*2 + side): even = minus side, odd = plus side.
+		enum Face
+		{
+			FACE_XM,
+			FACE_XP,
+			FACE_YM,
+			FACE_YP,
+			FACE_ZM,
+			FACE_ZP,
+			NUM_FACES,
+		};
+
+		//	construction / destruction
+		ConvexHeightField(): CollisionShape( SHAPE_CONVEX_HEIGHT_FIELD ){}
+		ConvexHeightField(const float4* vtxBuffer, const int4* idxBuffer, int nTriangles);
+		ConvexHeightField(const ShapeBase* shape);
+		ConvexHeightField(const float4* eqn, int nEqn);
+
+		virtual ~ConvexHeightField();
+
+		//	CollisionShape interface
+		virtual float queryDistance(const float4& p ) const;
+		//	distance is not written to normalOut.w
+		virtual bool queryDistanceWithNormal( const float4& p, float4& normalOut ) const;
+
+		float querySupportHeight(const float4& p ) const;
+
+		//	NOTE(review): purpose undocumented in the original ("what is it?")
+		float queryW(const float4& p ) const;
+
+		//	per-texel accessors; x and y must be smaller than HEIGHT_RES
+		u8& sample(Face face, int x, int y);
+		u8 sample(Face face, int x, int y) const;
+
+		u8& sampleSupport(Face face, int x, int y);
+		u8 sampleSupport(Face face, int x, int y) const;
+
+		float4& sampleNormal(Face face, int x, int y);
+		float4 sampleNormal(Face face, int x, int y) const;
+
+		//	sample points
+		void calcSamplePoints(float4* points) const;
+		float4 calcSamplePoint(int sIdx) const;
+		const float4* getSamplePoints() const;
+
+		int getNumSamplePoints() const;
+
+		//void displaySamples(const float4& translation, const Quaternion& quaternion) const;
+
+	private:
+		void create( const float4* vtxBuffer, const int4* idxBuffer, int nTriangles );
+
+	public:
+		//	quantized height per texel, indexed HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES
+		u8 m_data[HEIGHT_RES*HEIGHT_RES*6];
+		//	per-texel normal, same indexing as m_data
+		float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+		//	height that the quantized value 255 maps to
+		float m_scale;
+
+		//	quantized support height per texel, same indexing as m_data
+		u8 m_supportHeight[HEIGHT_RES*HEIGHT_RES*6];
+
+		//	array returned by getSamplePoints()
+		float4 m_samplePoints[HEIGHT_RES*HEIGHT_RES*6];
+		//	one bounding box per face
+		Aabb m_faceAabbs[6];
+};
+
+//	Mutable access to the quantized height of texel (x, y) on the given face.
+__inline
+u8& ConvexHeightField::sample(Face face, int x, int y)
+{
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+
+	const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
+	return m_data[ texel ];
+}
+
+//	Read-only copy of the quantized height of texel (x, y) on the given face.
+__inline
+u8 ConvexHeightField::sample(Face face, int x, int y) const
+{
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+
+	const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
+	return m_data[ texel ];
+}
+
+//	Mutable access to the quantized support height of texel (x, y) on the given face.
+__inline
+u8& ConvexHeightField::sampleSupport(Face face, int x, int y)
+{
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+
+	const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
+	return m_supportHeight[ texel ];
+}
+
+//	Read-only copy of the quantized support height of texel (x, y) on the given face.
+__inline
+u8 ConvexHeightField::sampleSupport(Face face, int x, int y) const
+{
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+
+	const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
+	return m_supportHeight[ texel ];
+}
+
+//	Mutable access to the normal stored for texel (x, y) on the given face.
+__inline
+float4& ConvexHeightField::sampleNormal(Face face, int x, int y)
+{
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+
+	const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
+	return m_normal[ texel ];
+}
+
+//	Read-only copy of the normal stored for texel (x, y) on the given face.
+__inline
+float4 ConvexHeightField::sampleNormal(Face face, int x, int y) const
+{
+	ADLASSERT( x < HEIGHT_RES );
+	ADLASSERT( y < HEIGHT_RES );
+
+	const int texel = HEIGHT_RES*HEIGHT_RES*face + x + y*HEIGHT_RES;
+	return m_normal[ texel ];
+}
+
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CubeMapUtils.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CubeMapUtils.h
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+//	Coords are 0.5f shifted. See CubeMapDemo.cpp for usage. 
+//	Static helpers converting between 3D directions and cube-map (face, x, y) coordinates.
+//	Face indices are axis*2 + side with side 0 = negative, 1 = positive:
+//	0 = X-, 1 = X+, 2 = Y-, 3 = Y+, 4 = Z-, 5 = Z+.
+class CubeMapUtils
+{
+	public:
+		//enum Face
+		//{
+		//	FACE_XM,
+		//	FACE_XP,
+		//	FACE_YM,
+		//	FACE_YP,
+		//	FACE_ZM,
+		//	FACE_ZP,
+		//	NUM_FACES,
+		//};
+
+		//	direction -> face index and in-face coordinates
+		__inline
+		static void calcCrd(const float4& p, int& faceIdxOut, float& x, float& y);
+
+		//	face index and in-face coordinates -> (unnormalized) direction
+		__inline
+		static float4 calcVector(int faceIdx, float x, float y);
+};
+
+
+//	Projects direction p onto the cube map.
+//	faceIdxOut receives axis*2 + (0 if p points to the negative side of the dominant axis, else 1).
+//	x and y receive the two remaining components divided by the dominant magnitude and
+//	remapped from [-1,1] to [0,1]. A zero dominant component yields x = y = 0.5.
+__inline
+void CubeMapUtils::calcCrd(const float4& p, int& faceIdxOut, float& x, float& y)
+{
+	const float4 majorAxes[] = {make_float4(1,0,0,0), make_float4(0,1,0,0), make_float4(0,0,1,0)};
+
+	//	dominant axis = component with the largest squared magnitude (ties keep the lower axis)
+	const float sq[] = {p.x*p.x, p.y*p.y, p.z*p.z};
+	int axis = (sq[1]>sq[0])? 1:0;
+	if( sq[2]>sq[axis] ) axis = 2;
+
+	const float4 majorAxis = majorAxes[axis];
+	const bool pointsNegative = dot3F4( p, majorAxis ) < 0.f;
+	faceIdxOut = axis*2 + (pointsNegative? 0:1);
+
+	//	pick the in-face components (cyclic order) and the magnitude to divide by
+	float denom;
+	switch( axis )
+	{
+	case 0:
+		denom = fabs(p.x);
+		x = p.y;
+		y = p.z;
+		break;
+	case 1:
+		denom = fabs(p.y);
+		x = p.z;
+		y = p.x;
+		break;
+	default:
+		denom = fabs(p.z);
+		x = p.x;
+		y = p.y;
+		break;
+	}
+
+	//	guard against division by zero for a null direction
+	const float invDenom = (denom==0.f)? 0.f: (1.f/denom);
+	x = (x*invDenom+1.f)*0.5f;
+	y = (y*invDenom+1.f)*0.5f;
+}
+
+//	Inverse of calcCrd: turns a face index and [0,1] in-face coordinates into a point on
+//	the surface of the [-1,1] cube. The result is not normalized.
+__inline
+float4 CubeMapUtils::calcVector(int faceIdx, float x, float y)
+{
+	//	remap [0,1] -> [-1,1]
+	const float u = x*2.f-1.f;
+	const float v = y*2.f-1.f;
+	//	even faces sit on the negative side of their axis, odd faces on the positive side
+	const float w = (faceIdx%2 == 0)? -1.f:1.f;
+
+	switch( faceIdx/2 )
+	{
+	case 0:
+		return make_float4(w, u, v);
+	case 1:
+		return make_float4(v,w,u);
+	default:
+		return make_float4(u,v,w);
+	}
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.cpp
@@ -0,0 +1,699 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "CustomCollisionDispatcher.h"
+#include "BulletCollision/BroadphaseCollision/btCollisionAlgorithm.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "CustomConvexShape.h"
+#include "CustomConvexPairCollision.h"
+#include "LinearMath/btQuickprof.h"
+
+
+
+#ifdef CL_PLATFORM_AMD
+
+#include "Adl/Adl.h"
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlContact4.h"
+#include "Stubs/AdlQuaternion.h"
+#include "Stubs/ChNarrowPhase.h"
+
+#include "Stubs/Solver.h"
+
+
+// Private state of CustomCollisionDispatcher for the OpenCL path. The dispatcher's
+// constructor allocates it, zero-fills it with memset and then populates every member.
+struct	CustomDispatchData
+{
+	// OpenCL device wrapper built from the context/device/queue passed to the dispatcher
+	adl::DeviceCL* m_ddcl;
+	// host-side device used for the CPU staging buffers
+	adl::Device* m_deviceHost;
+	// shape storage consumed by the narrowphase (capacity MAX_CONVEX_SHAPES_CL)
+	ShapeDataType m_ShapeBuffer;
+	
+	// broadphase pairs as (companion id 0, companion id 1): CPU staging copy and GPU copy
+	adl::HostBuffer<int2>* m_pBufPairsCPU;
+	adl::Buffer<int2>* m_pBufPairsGPU;
+	// contacts produced by the narrowphase: GPU output and CPU read-back copy
+	adl::Buffer<Contact4>* m_pBufContactOutGPU;
+	adl::HostBuffer<Contact4>* m_pBufContactOutCPU;
+	// narrowphase data returned by ChNarrowphase::allocate
+	adl::ChNarrowphase<adl::TYPE_CL>::Data* m_Data;
+
+	// rigid body state indexed by companion id: CPU staging copy and GPU copy
+	adl::HostBuffer<RigidBodyBase::Body>* m_pBufRBodiesCPU;
+	adl::Buffer<RigidBodyBase::Body>* m_pBufRBodiesGPU;
+
+	// per-body inertia data indexed by companion id; the "CPU" one is an adl::Buffer
+	// created on m_deviceHost rather than an adl::HostBuffer
+	adl::Buffer<RigidBodyBase::Shape>*	m_bodyInfoBufferCPU;
+	adl::Buffer<RigidBodyBase::Shape>*	m_bodyInfoBufferGPU;
+
+	// GPU solver state and its contact / friction constraint storage
+	adl::Solver<adl::TYPE_CL>::Data* m_solverDataGPU;
+	SolverData		m_contactCGPU;
+	void*			m_frictionCGPU;
+
+	// number of slots of m_ShapeBuffer handed out so far
+	int m_numAcceleratedShapes;
+};
+#endif //CL_PLATFORM_AMD
+
+// Builds the dispatcher. With CL_PLATFORM_AMD defined and both a context and a queue
+// supplied, all host/OpenCL buffers and solver state are allocated up front; otherwise
+// m_internalData stays null and dispatching falls back to btCollisionDispatcher.
+CustomCollisionDispatcher::CustomCollisionDispatcher(btCollisionConfiguration* collisionConfiguration
+#ifdef CL_PLATFORM_AMD
+		, cl_context context,cl_device_id device,cl_command_queue queue
+#endif //CL_PLATFORM_AMD
+):btCollisionDispatcher(collisionConfiguration),
+m_internalData(0)
+{
+#ifdef CL_PLATFORM_AMD
+
+	// no OpenCL context or queue: leave m_internalData null (CPU path)
+	if (!context || !queue)
+		return;
+
+	CustomDispatchData* data = new CustomDispatchData();
+	memset(data,0,sizeof(CustomDispatchData));
+	m_internalData = data;
+
+	adl::DeviceUtils::Config cfg;
+
+	// wrap the caller-provided OpenCL handles in an Adl device
+	adl::DeviceCL* deviceCL = new adl::DeviceCL();
+	deviceCL->m_deviceIdx = device;
+	deviceCL->m_context = context;
+	deviceCL->m_commandQueue = queue;
+	deviceCL->m_kernelManager = new adl::KernelManager;
+	data->m_ddcl = deviceCL;
+
+	// host device and CPU-side staging buffers
+	adl::Device* deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );
+	data->m_deviceHost = deviceHost;
+	data->m_pBufPairsCPU = new adl::HostBuffer<int2>(deviceHost, MAX_BROADPHASE_COLLISION_CL);
+	data->m_pBufContactOutCPU = new adl::HostBuffer<Contact4>(deviceHost, MAX_BROADPHASE_COLLISION_CL);
+	data->m_pBufRBodiesCPU = new adl::HostBuffer<RigidBodyBase::Body>(deviceHost, MAX_CONVEX_BODIES_CL);
+	data->m_bodyInfoBufferCPU = new adl::Buffer<RigidBodyBase::Shape>(deviceHost,MAX_CONVEX_BODIES_CL);
+
+	// GPU-side buffers, solver and narrowphase state
+	data->m_pBufContactOutGPU = new adl::Buffer<Contact4>(deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	data->m_bodyInfoBufferGPU = new adl::Buffer<RigidBodyBase::Shape>(deviceCL,MAX_CONVEX_BODIES_CL);
+	data->m_pBufPairsGPU = new adl::Buffer<int2>(deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	data->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate( deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	data->m_pBufRBodiesGPU = new adl::Buffer<RigidBodyBase::Body>(deviceCL, MAX_CONVEX_BODIES_CL);
+	data->m_Data = adl::ChNarrowphase<adl::TYPE_CL>::allocate(deviceCL);
+	data->m_ShapeBuffer = adl::ChNarrowphase<adl::TYPE_CL>::allocateShapeBuffer(deviceCL, MAX_CONVEX_SHAPES_CL);
+	data->m_numAcceleratedShapes = 0;
+
+	// constraint storage used by the GPU solver
+	data->m_contactCGPU = adl::Solver<adl::TYPE_CL>::allocateConstraint4( deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	data->m_frictionCGPU = adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( deviceCL, MAX_BROADPHASE_COLLISION_CL);
+
+#endif //CL_PLATFORM_AMD
+}
+
+// Releases everything the constructor allocated for the OpenCL path.
+// Buffers are deleted before the devices they were created on.
+CustomCollisionDispatcher::~CustomCollisionDispatcher(void)
+{
+#ifdef CL_PLATFORM_AMD
+	if (m_internalData)
+	{
+		delete m_internalData->m_pBufPairsCPU;
+		delete m_internalData->m_pBufPairsGPU;
+		delete m_internalData->m_pBufContactOutGPU;
+		delete m_internalData->m_pBufContactOutCPU;
+
+		// these four are also created with new in the constructor and were previously leaked
+		delete m_internalData->m_pBufRBodiesCPU;
+		delete m_internalData->m_pBufRBodiesGPU;
+		delete m_internalData->m_bodyInfoBufferCPU;
+		delete m_internalData->m_bodyInfoBufferGPU;
+
+		adl::Solver<adl::TYPE_CL>::deallocateConstraint4( m_internalData->m_contactCGPU );
+		adl::Solver<adl::TYPE_CL>::deallocateFrictionConstraint( m_internalData->m_frictionCGPU );
+
+
+		adl::Solver<adl::TYPE_CL>::deallocate(m_internalData->m_solverDataGPU);
+
+		// NOTE(review): m_Data (ChNarrowphase::allocate), m_ShapeBuffer (allocateShapeBuffer)
+		// and m_ddcl->m_kernelManager are allocated in the constructor but not released here;
+		// presumably ChNarrowphase offers matching deallocate calls -- confirm and add them.
+
+		adl::DeviceUtils::deallocate(m_internalData->m_deviceHost);
+		delete m_internalData->m_ddcl;		
+		delete m_internalData;
+		m_internalData = 0;
+	}
+	
+#endif //CL_PLATFORM_AMD
+
+}
+
+
+#ifdef CL_PLATFORM_AMD
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+// Builds the inertia record for one collision object.
+// Static/kinematic objects and non-btRigidBody objects get zero inverse inertia.
+// Otherwise m_initInvInertia holds the local inverse inertia diagonal and m_invInertia
+// holds it rotated by the object's current orientation (R * I^-1 * R^T).
+// Only m_initInvInertia and m_invInertia are written.
+RigidBodyBase::Shape CreateBodyInfo(const btCollisionObject& colObj)
+{
+	RigidBodyBase::Shape info;
+	info.m_initInvInertia = mtZero();
+
+	const btRigidBody* rigidBody = btRigidBody::upcast(&colObj);
+	if( colObj.isStaticOrKinematicObject() || !rigidBody)
+	{
+		info.m_invInertia = mtZero();
+		return info;
+	}
+
+	// local-space inverse inertia goes on the diagonal
+	btVector3 invInertiaDiag = rigidBody->getInvInertiaDiagLocal();
+	info.m_initInvInertia.m_row[0].x = invInertiaDiag.x();
+	info.m_initInvInertia.m_row[1].y = invInertiaDiag.y();
+	info.m_initInvInertia.m_row[2].z = invInertiaDiag.z();
+
+	// convert the Bullet orientation to an Adl quaternion
+	btQuaternion orientation = colObj.getWorldTransform().getRotation();
+	Quaternion adlOrientation;
+	adlOrientation.x = orientation.getX();
+	adlOrientation.y = orientation.getY();
+	adlOrientation.z = orientation.getZ();
+	adlOrientation.w = orientation.getW();
+
+	// rotate the local tensor by the orientation
+	Matrix3x3 rot = qtGetRotationMatrix( adlOrientation);
+	Matrix3x3 rotT = mtTranspose( rot );
+	info.m_invInertia = mtMul( mtMul( rot, info.m_initInvInertia ), rotT );
+
+	return info;
+}
+
+// Converts a Bullet collision object into the Adl rigid body record.
+// Copies position, orientation, restitution and friction, and stores shapeIdx.
+// Static/kinematic objects and non-btRigidBody objects get zero velocities and zero
+// inverse mass; dynamic bodies get their current velocities and inverse mass.
+RigidBodyBase::Body CreateRBodyCL(const btCollisionObject& colObj, int shapeIdx)
+{
+	RigidBodyBase::Body body;
+
+	// shape index and material coefficients
+	body.m_shapeIdx = shapeIdx;
+	body.m_restituitionCoeff = colObj.getRestitution();
+	body.m_frictionCoeff = colObj.getFriction();
+
+	// position
+	const btVector3& origin = colObj.getWorldTransform().getOrigin();
+	body.m_pos.x = origin.getX();
+	body.m_pos.y = origin.getY();
+	body.m_pos.z = origin.getZ();
+	body.m_pos.w = 0.0f;
+
+	// orientation
+	btQuaternion orientation = colObj.getWorldTransform().getRotation();
+	body.m_quat.x = orientation.getX();
+	body.m_quat.y = orientation.getY();
+	body.m_quat.z = orientation.getZ();
+	body.m_quat.w = orientation.getW();
+
+	const btRigidBody* rigidBody = btRigidBody::upcast(&colObj);
+	const bool isDynamic = !( colObj.isStaticOrKinematicObject() || !rigidBody);
+	if( isDynamic )
+	{
+		const btVector3& linear = rigidBody->getLinearVelocity();
+		const btVector3& angular = rigidBody->getAngularVelocity();
+
+		body.m_linVel = make_float4(linear.x(),linear.y(),linear.z(),0.0f);
+		body.m_angVel = make_float4(angular.x(),angular.y(),angular.z(),0.0f);
+		body.m_invMass = rigidBody->getInvMass();
+	}
+	else
+	{
+		// immovable: no velocity, infinite mass
+		body.m_linVel = make_float4(0.0f, 0.0f, 0.0f);
+		body.m_angVel = make_float4(0.0f, 0.0f, 0.0f);
+		body.m_invMass = 0.f;
+	}
+
+	return body;
+}
+#endif //CL_PLATFORM_AMD
+
+// Processes all broadphase pairs.
+// Without OpenCL data (m_internalData == 0, or CL_PLATFORM_AMD undefined) this simply
+// forwards to btCollisionDispatcher::dispatchAllCollisionPairs.
+// With OpenCL data it:
+//   1. refreshes the contact points of the existing manifolds,
+//   2. converts every pair's bodies (indexed by their companion id) into Adl body records
+//      and registers the shapes with the narrowphase on first use,
+//   3. runs the GPU narrowphase and, when useSolver is set, the contact solver,
+//   4. writes the solved velocities back to the btRigidBody objects (useSolver), or copies
+//      the contacts into persistent manifolds (otherwise).
+// All pairs are expected to be CustomConvexShape objects with a non-negative companion id.
+void CustomCollisionDispatcher::dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher) 
+{
+	BT_PROFILE("CustomCollisionDispatcher::dispatchAllCollisionPairs");
+	{
+	btBroadphasePairArray& overlappingPairArray = pairCache->getOverlappingPairArray();
+	bool bGPU = (m_internalData != 0);
+#ifdef CL_PLATFORM_AMD
+	if ( !bGPU )
+#endif //CL_PLATFORM_AMD
+	{
+		BT_PROFILE("btCollisionDispatcher::dispatchAllCollisionPairs");
+		btCollisionDispatcher::dispatchAllCollisionPairs(pairCache,dispatchInfo,dispatcher);
+	}
+#ifdef CL_PLATFORM_AMD
+
+	else
+	{
+		{
+			BT_PROFILE("refreshContactPoints");
+			//----------------------------------------------------------------
+			// GPU version of convex heightmap narrowphase collision detection
+			//----------------------------------------------------------------
+			for ( int i = 0; i < getNumManifolds(); i++ )
+			{
+				btPersistentManifold* manifold = getManifoldByIndexInternal(i);
+
+
+				btCollisionObject* body0 = (btCollisionObject*)manifold->getBody0();
+				btCollisionObject* body1 = (btCollisionObject*)manifold->getBody1();
+
+				manifold->refreshContactPoints(body0->getWorldTransform(),body1->getWorldTransform());
+			}
+		}
+
+		// OpenCL 
+		int nColPairsFromBP = overlappingPairArray.size();
+		btAssert(MAX_BROADPHASE_COLLISION_CL >= nColPairsFromBP);
+
+		// highest companion id seen; -1 means no pair could be accelerated
+		int maxBodyIndex = -1;
+
+		{
+			BT_PROFILE("CreateRBodyCL and GPU pairs");
+			for ( int i=0; i<overlappingPairArray.size(); i++)
+			{
+				btAssert(i<MAX_BROADPHASE_COLLISION_CL);
+
+				btBroadphasePair* pair = &overlappingPairArray[i];
+
+				btCollisionObject* colObj0 = (btCollisionObject*)pair->m_pProxy0->m_clientObject;
+				btCollisionObject* colObj1 = (btCollisionObject*)pair->m_pProxy1->m_clientObject;
+
+				int bodyIndex0 = colObj0->getCompanionId();
+				int bodyIndex1 = colObj1->getCompanionId();
+
+				//keep a one-to-one mapping between Bullet and Adl broadphase pairs
+				(*m_internalData->m_pBufPairsCPU)[i].x = bodyIndex0;
+				(*m_internalData->m_pBufPairsCPU)[i].y = bodyIndex1;
+
+				if (bodyIndex0>=0 && bodyIndex1>=0)
+				{
+					//create companion shapes (if necessary)
+
+					btAssert(colObj0->getCollisionShape()->getShapeType() == CUSTOM_POLYHEDRAL_SHAPE_TYPE);
+					btAssert(colObj1->getCollisionShape()->getShapeType() == CUSTOM_POLYHEDRAL_SHAPE_TYPE);
+
+					CustomConvexShape* convexShape0 = (CustomConvexShape*)colObj0->getCollisionShape();
+					CustomConvexShape* convexShape1 = (CustomConvexShape*)colObj1->getCollisionShape();
+
+					// a negative companion shape index means the shape has not been uploaded yet
+					if (convexShape0->m_acceleratedCompanionShapeIndex<0)
+					{
+						convexShape0->m_acceleratedCompanionShapeIndex = m_internalData->m_numAcceleratedShapes;
+						adl::ChNarrowphase<adl::TYPE_CL>::setShape(m_internalData->m_ShapeBuffer, convexShape0->m_ConvexHeightField, convexShape0->m_acceleratedCompanionShapeIndex, 0.0f);
+						m_internalData->m_numAcceleratedShapes++;
+					}
+					if (convexShape1->m_acceleratedCompanionShapeIndex<0)
+					{
+						convexShape1->m_acceleratedCompanionShapeIndex = m_internalData->m_numAcceleratedShapes;
+						adl::ChNarrowphase<adl::TYPE_CL>::setShape(m_internalData->m_ShapeBuffer, convexShape1->m_ConvexHeightField, convexShape1->m_acceleratedCompanionShapeIndex, 0.0f);
+						m_internalData->m_numAcceleratedShapes++;
+					}
+
+					btAssert(m_internalData->m_numAcceleratedShapes<MAX_CONVEX_SHAPES_CL);
+
+					if (bodyIndex0>maxBodyIndex)
+						maxBodyIndex = bodyIndex0;
+					if (bodyIndex1>maxBodyIndex)
+						maxBodyIndex = bodyIndex1;
+
+					btAssert(maxBodyIndex<MAX_CONVEX_BODIES_CL);
+					if (maxBodyIndex>=MAX_CONVEX_BODIES_CL)
+					{
+						printf("error: maxBodyIndex(%d)>MAX_CONVEX_BODIES_CL(%d)\n",maxBodyIndex,MAX_CONVEX_BODIES_CL);
+					}
+
+					(*m_internalData->m_pBufRBodiesCPU)[bodyIndex0] = CreateRBodyCL(*colObj0, convexShape0->m_acceleratedCompanionShapeIndex);
+					m_internalData->m_bodyInfoBufferCPU->m_ptr[bodyIndex0] = CreateBodyInfo(*colObj0);
+					// body 1 must reference its own shape (this used convexShape0's index before)
+					(*m_internalData->m_pBufRBodiesCPU)[bodyIndex1] = CreateRBodyCL(*colObj1, convexShape1->m_acceleratedCompanionShapeIndex);
+					m_internalData->m_bodyInfoBufferCPU->m_ptr[bodyIndex1] = CreateBodyInfo(*colObj1);
+				} else
+				{
+					//TODO: dispatch using default dispatcher
+					btAssert(0);
+				}
+			}
+		}
+
+
+		if (maxBodyIndex>=0)
+		{
+			
+			int numOfConvexRBodies = maxBodyIndex+1;
+
+			
+
+			adl::ChNarrowphaseBase::Config cfgNP;
+			cfgNP.m_collisionMargin = 0.01f;
+			int nContactOut = 0;
+
+			// NOTE(review): the narrowphase reads m_pBufPairsGPU / m_pBufRBodiesGPU here, but
+			// this function only writes those GPU buffers further down (in the !useCpu branch),
+			// so it appears to run on the previous call's data -- confirm the intended order.
+			{
+				BT_PROFILE("ChNarrowphase::execute");
+				adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, m_internalData->m_pBufPairsGPU, nColPairsFromBP, m_internalData->m_pBufRBodiesGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
+				adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+			}
+
+
+			// hard-coded switches: solve on the GPU, host solver path disabled
+			bool useCpu = false;//true;
+			bool useSolver = true;//true;//false;
+			
+			if (useSolver)
+			{
+				float dt=1./60.;
+				adl::SolverBase::ConstraintCfg csCfg( dt );
+				csCfg.m_enableParallelSolve = true;
+				csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
+				csCfg.m_staticIdx = -1;//numOfConvexRBodies-1;//m_nBodies-1;
+
+			
+			if (useCpu)
+			{
+
+				{
+					BT_PROFILE("read m_pBufContactOutGPU");
+					m_internalData->m_pBufContactOutGPU->read(m_internalData->m_pBufContactOutCPU->m_ptr, nContactOut);//MAX_BROADPHASE_COLLISION_CL);
+					adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+				}
+
+				BT_PROFILE("CPU stuff");
+				adl::Solver<adl::TYPE_HOST>::Data* solverData = adl::Solver<adl::TYPE_HOST>::allocate( m_internalData->m_deviceHost, nContactOut);
+
+				SolverData contactCPU = adl::Solver<adl::TYPE_HOST>::allocateConstraint4( 
+					m_internalData->m_deviceHost, 
+					numOfConvexRBodies*MAX_PAIRS_PER_BODY_CL );
+
+				void* frictionCPU = adl::Solver<adl::TYPE_HOST>::allocateFrictionConstraint( 
+					m_internalData->m_deviceHost, 
+					numOfConvexRBodies*MAX_PAIRS_PER_BODY_CL );
+
+				//write body with current linear/angluar velocities to GPU
+				m_internalData->m_bodyInfoBufferGPU->write(m_internalData->m_bodyInfoBufferCPU->m_ptr,numOfConvexRBodies);
+				adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+
+
+				if (nContactOut)
+				{
+					reorderConvertToConstraints2( 
+						solverData, 
+						m_internalData->m_pBufRBodiesCPU, 
+						m_internalData->m_bodyInfoBufferCPU, 
+						m_internalData->m_pBufContactOutCPU,
+						contactCPU, 
+						frictionCPU, 
+						nContactOut, 
+						csCfg );
+
+					bool forceGPU = true;
+
+					if (forceGPU)
+					{
+
+						SolverData contactCPUcopy = adl::Solver<adl::TYPE_HOST>::allocateConstraint4( 
+							m_internalData->m_deviceHost, 
+							numOfConvexRBodies*MAX_PAIRS_PER_BODY_CL );
+
+							adl::Solver<adl::TYPE_CL>::reorderConvertToConstraints( 
+						m_internalData->m_solverDataGPU, 
+						m_internalData->m_pBufRBodiesGPU, 
+						m_internalData->m_bodyInfoBufferGPU, 
+						m_internalData->m_pBufContactOutGPU,
+						m_internalData->m_contactCGPU, 
+						m_internalData->m_frictionCGPU, 
+						nContactOut, 
+						csCfg );
+
+						adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+						m_internalData->m_contactCGPU->read(contactCPUcopy->m_ptr,nContactOut);
+						adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+
+						
+						//m_internalData->m_contactCGPU->write(contactCPU->m_ptr,nContactOut);
+						adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+						m_internalData->m_solverDataGPU->m_nIterations = 4;
+					
+						adl::Solver<adl::TYPE_CL>::solveContactConstraint( m_internalData->m_solverDataGPU, 
+							m_internalData->m_pBufRBodiesGPU, 
+							m_internalData->m_bodyInfoBufferGPU, 
+							m_internalData->m_contactCGPU,
+							0, 
+							nContactOut );
+
+							adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+
+						//read body updated linear/angular velocities back to CPU
+						m_internalData->m_pBufRBodiesGPU->read(
+							m_internalData->m_pBufRBodiesCPU->m_ptr,numOfConvexRBodies);
+							adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+
+						// the debug copy of the GPU constraints is only needed inside this branch;
+						// release it here (it was leaked before)
+						adl::Solver<adl::TYPE_HOST>::deallocateConstraint4( contactCPUcopy );
+
+					} else
+					{
+					solverData->m_nIterations = 4;
+					adl::Solver<adl::TYPE_HOST>::solveContactConstraint( solverData, 
+						m_internalData->m_pBufRBodiesCPU, 
+						m_internalData->m_bodyInfoBufferCPU, 
+						contactCPU,
+						0, 
+						nContactOut );
+					}
+
+
+
+					}
+
+				adl::Solver<adl::TYPE_HOST>::deallocateConstraint4( contactCPU );
+				adl::Solver<adl::TYPE_HOST>::deallocateFrictionConstraint( frictionCPU );
+				adl::Solver<adl::TYPE_HOST>::deallocate( solverData );
+
+				
+
+			}
+			else
+			{
+				
+				{
+					BT_PROFILE("rigid body data to GPU buffer");
+					// Transfer rigid body data from CPU buffer to GPU buffer
+					m_internalData->m_pBufRBodiesGPU->write(m_internalData->m_pBufRBodiesCPU->m_ptr, numOfConvexRBodies);
+					m_internalData->m_pBufPairsGPU->write(m_internalData->m_pBufPairsCPU->m_ptr, MAX_BROADPHASE_COLLISION_CL);
+					//write body with current linear/angluar velocities to GPU
+					m_internalData->m_bodyInfoBufferGPU->write(m_internalData->m_bodyInfoBufferCPU->m_ptr,numOfConvexRBodies);
+					adl::DeviceUtils::waitForCompletion(m_internalData->m_ddcl);
+				}
+				{
+					BT_PROFILE("GPU reorderConvertToConstraints");
+					adl::Solver<adl::TYPE_CL>::reorderConvertToConstraints( 
+						m_internalData->m_solverDataGPU, 
+						m_internalData->m_pBufRBodiesGPU, 
+						m_internalData->m_bodyInfoBufferGPU, 
+						m_internalData->m_pBufContactOutGPU,
+						m_internalData->m_contactCGPU, 
+						m_internalData->m_frictionCGPU, 
+						nContactOut, 
+						csCfg );
+				}
+
+				{
+					BT_PROFILE("GPU solveContactConstraint");
+				m_internalData->m_solverDataGPU->m_nIterations = 4;
+					
+					adl::Solver<adl::TYPE_CL>::solveContactConstraint( m_internalData->m_solverDataGPU, 
+						m_internalData->m_pBufRBodiesGPU, 
+						m_internalData->m_bodyInfoBufferGPU, 
+						m_internalData->m_contactCGPU,
+						0, 
+						nContactOut );
+	
+					adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+				}
+				{
+					BT_PROFILE("read body velocities back to CPU");
+					//read body updated linear/angular velocities back to CPU
+					m_internalData->m_pBufRBodiesGPU->read(
+						m_internalData->m_pBufRBodiesCPU->m_ptr,numOfConvexRBodies);
+						adl::DeviceUtils::waitForCompletion( m_internalData->m_ddcl );
+				}
+
+				
+			}
+
+#if 0
+				if( !m_useGPUPipeline )
+				{	//	CPU
+						BT_PROFILE("CPU solve");
+						{
+							BT_PROFILE("CPU reorderConvertToConstraints");
+
+					SOLVER_CLASS<TYPE_HOST>::reorderConvertToConstraints( solver, m_bodyBuffer, m_bodyInfoBufferCPU, (Buffer<Contact4>*)m_contactBuffer, 
+						contactC, frictionC, m_numContacts, csCfg );
+						}
+						{
+							BT_PROFILE("CPU solveContactConstraint");
+
+					solver->m_nIterations = 4;
+					SOLVER_CLASS<TYPE_HOST>::solveContactConstraint( solver, m_bodyBuffer, m_bodyInfoBufferCPU, contactC, 0, m_numContacts );
+						}
+				}
+				else
+				{
+						BT_PROFILE("GPU solve");
+					{	//	GPU using host buffers
+						{
+							BT_PROFILE("GPU reorderConvertToConstraints");
+
+						Solver<TYPE_CL>::reorderConvertToConstraints( m_solver, m_bodyBuffer, m_bodyInfoBufferCPU, (Buffer<Contact4>*)m_contactBuffer, 
+							contactC, frictionC, m_numContacts, csCfg );
+						}
+						timerEnd();
+
+						timerStart(0);
+						//for(int iter=0; iter<4; iter++)
+						{
+							BT_PROFILE("GPU solveContactConstraint");
+
+							Solver<TYPE_CL>::solveContactConstraint( m_solver, m_bodyBuffer, m_bodyInfoBufferCPU, contactC, frictionC, m_numContacts );
+						}
+						DeviceUtils::waitForCompletion( m_device );
+					}
+				}
+				timerEnd();
+#endif
+
+
+			}
+
+			//if we ran the solver, it will overwrite the batchIdx so we cannot write back the results
+			//try to make it work by writing velocity back to rigid body
+
+			if (useSolver)
+			{
+				
+				BT_PROFILE("writing velocity back to btRigidBody");
+
+				for ( int i=0; i<overlappingPairArray.size(); i++)
+				{
+					btAssert(i<MAX_BROADPHASE_COLLISION_CL);
+
+					btBroadphasePair* pair = &overlappingPairArray[i];
+
+					btCollisionObject* colObj0 = (btCollisionObject*)pair->m_pProxy0->m_clientObject;
+					btCollisionObject* colObj1 = (btCollisionObject*)pair->m_pProxy1->m_clientObject;
+
+					int bodyIndex0 = colObj0->getCompanionId();
+					int bodyIndex1 = colObj1->getCompanionId();
+
+					// Pairs with a negative companion id were not converted in the first loop,
+					// so there is nothing to read back for them, and indexing the body buffer
+					// with a negative id would be out of bounds.
+					if (bodyIndex0<0 || bodyIndex1<0)
+						continue;
+
+					RigidBodyBase::Body* bA = &m_internalData->m_pBufRBodiesCPU->m_ptr[bodyIndex0];
+					RigidBodyBase::Body* bB = &m_internalData->m_pBufRBodiesCPU->m_ptr[bodyIndex1];
+					btRigidBody* bodyA = btRigidBody::upcast(colObj0);
+					if (bodyA && !bodyA->isStaticOrKinematicObject())
+					{
+						bodyA->setLinearVelocity(btVector3(
+										bA->m_linVel.x,
+										bA->m_linVel.y,
+										bA->m_linVel.z));
+
+						bodyA->setAngularVelocity(btVector3(
+										bA->m_angVel.x,
+										bA->m_angVel.y,
+										bA->m_angVel.z));
+					}
+					btRigidBody* bodyB = btRigidBody::upcast(colObj1);
+					if (bodyB && !bodyB->isStaticOrKinematicObject())
+					{
+						bodyB->setLinearVelocity(btVector3(
+							bB->m_linVel.x,
+							bB->m_linVel.y,
+							bB->m_linVel.z));
+						bodyB->setAngularVelocity(btVector3(
+										bB->m_angVel.x,
+										bB->m_angVel.y,
+										bB->m_angVel.z));
+
+					}
+
+
+
+
+				}
+			} else
+			{
+				BT_PROFILE("copy Contact4 to btPersistentManifold");
+				// Now we got the narrowphase info from GPU and need to update rigid bodies with the info and go back to the original pipeline in Bullet physics. 
+				for ( int i = 0; i < nContactOut; i++ )
+				{
+					Contact4 contact = (*m_internalData->m_pBufContactOutCPU)[i];
+
+					int idxBodyA = contact.m_bodyAPtr;
+					int idxBodyB = contact.m_bodyBPtr;
+
+					// m_batchIdx is used as the index of the originating broadphase pair
+					btAssert(contact.m_batchIdx>=0);
+					btAssert(contact.m_batchIdx<overlappingPairArray.size());
+
+					btBroadphasePair* pair = &overlappingPairArray[contact.m_batchIdx];
+
+					btCollisionObject* colObj0 = (btCollisionObject*)pair->m_pProxy0->m_clientObject;
+					btCollisionObject* colObj1 = (btCollisionObject*)pair->m_pProxy1->m_clientObject;
+
+					if (!pair->m_algorithm)
+					{
+						pair->m_algorithm = findAlgorithm(colObj0,colObj1,0);
+					}
+
+					btManifoldResult contactPointResult(colObj0, colObj1);
+
+
+					CustomConvexConvexPairCollision* pairAlgo = (CustomConvexConvexPairCollision*) pair->m_algorithm;
+
+					if (!pairAlgo->getManifoldPtr())
+					{
+						pairAlgo->createManifoldPtr(colObj0,colObj1,dispatchInfo);
+					}
+					
+					contactPointResult.setPersistentManifold(pairAlgo->getManifoldPtr());
+					
+					contactPointResult.getPersistentManifold()->refreshContactPoints(colObj0->getWorldTransform(),colObj1->getWorldTransform());
+
+					const btTransform& transA = colObj0->getWorldTransform();
+					const btTransform& transB = colObj1->getWorldTransform();
+
+					int numPoints = contact.getNPoints();
+
+					for ( int k=0; k < numPoints; k++ )
+					{
+						btVector3 normalOnBInWorld(
+							contact.m_worldNormal.x,
+							contact.m_worldNormal.y,
+							contact.m_worldNormal.z);
+						btVector3 pointInWorldOnB(
+							contact.m_worldPos[k].x,
+							contact.m_worldPos[k].y,
+							contact.m_worldPos[k].z);
+
+						// the w component of the contact position carries the depth
+						btScalar depth = contact.m_worldPos[k].w;
+
+						// only points with negative depth are added
+						if (depth<0)
+						{
+							const btVector3 deltaC = transB.getOrigin() - transA.getOrigin();
+
+							normalOnBInWorld.normalize();
+
+							// flip the normal when it points along the A-to-B centre offset
+							if((deltaC.dot(normalOnBInWorld))>0.0f)
+							{
+								normalOnBInWorld= -normalOnBInWorld;
+
+								contactPointResult.addContactPoint(normalOnBInWorld, pointInWorldOnB, depth);
+							}
+							else
+							{
+								contactPointResult.addContactPoint(normalOnBInWorld, pointInWorldOnB-normalOnBInWorld*depth, depth);
+							}
+						}
+					}
+				}
+			}
+		}
+	}
+#endif //CL_PLATFORM_AMD
+	}
+
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomCollisionDispatcher.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef CUSTOM_COLLISION_DISPATCHER_H
+#define CUSTOM_COLLISION_DISPATCHER_H
+
+
+#include "BulletCollision/CollisionDispatch/btCollisionDispatcher.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+
+
+// Compile-time size limits used by the custom dispatcher.
+// MAX_CONVEX_BODIES_CL is parenthesized so that it expands as a single value
+// inside larger expressions such as `x % MAX_CONVEX_BODIES_CL`.
+#define MAX_CONVEX_BODIES_CL (64*1024)
+#define MAX_PAIRS_PER_BODY_CL 32
+#define MAX_CONVEX_SHAPES_CL 8192
+// Upper bound on broadphase pairs: one full set of pairs per body.
+#define MAX_BROADPHASE_COLLISION_CL (MAX_CONVEX_BODIES_CL*MAX_PAIRS_PER_BODY_CL)
+
+
+
+struct	CustomDispatchData;
+
+#ifdef CL_PLATFORM_AMD
+#ifdef __APPLE__
+	#ifdef USE_MINICL
+		#include <MiniCL/cl.h>
+	#else
+		#include <OpenCL/cl.h>
+	#endif
+#else //__APPLE__
+	#ifdef USE_MINICL
+		#include <MiniCL/cl.h>
+	#else
+		#include <CL/cl.h>
+	#endif
+#endif //__APPLE__
+#endif
+
+// Dispatcher derived from btCollisionDispatcher that declares its own
+// dispatchAllCollisionPairs. In CL_PLATFORM_AMD builds the constructor also
+// accepts OpenCL handles (context, device, command queue), each defaulting to NULL.
+class CustomCollisionDispatcher : public btCollisionDispatcher
+{
+public:
+	CustomCollisionDispatcher (btCollisionConfiguration* collisionConfiguration
+#ifdef CL_PLATFORM_AMD
+		, cl_context context = NULL,cl_device_id device = NULL,cl_command_queue queue = NULL
+#endif //CL_PLATFORM_AMD
+		);
+
+	virtual ~CustomCollisionDispatcher(void);
+
+protected:
+
+	// Implementation state; CustomDispatchData is only forward-declared in this header.
+	CustomDispatchData*	m_internalData;
+
+	// Presumably returns the entry of pairArray joining the two body indices -- TODO confirm against the definition.
+	btBroadphasePair* GetPair(btBroadphasePairArray& pairArray, int idxBodyA, int idxBodyB);
+
+public:
+	// Virtual pair-dispatch entry point; the definition is not part of this header.
+	virtual void dispatchAllCollisionPairs(btOverlappingPairCache* pairCache,const btDispatcherInfo& dispatchInfo,btDispatcher* dispatcher);
+};
+
+#endif //CUSTOM_COLLISION_DISPATCHER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.cpp
@@ -0,0 +1,409 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "CustomConvexPairCollision.h"
+#include "ConvexHeightFieldShape.h"
+#include "CustomConvexShape.h"
+#include "BulletCollision/CollisionDispatch/btCollisionObject.h"
+#include "Stubs/AdlContact4.h"
+#include "Stubs/AdlTransform.h"
+
+
+// Forwards every argument unchanged to btConvexConvexAlgorithm; this class adds no state of its own.
+CustomConvexConvexPairCollision::CustomConvexConvexPairCollision(
+	btPersistentManifold* mf,
+	const btCollisionAlgorithmConstructionInfo& ci,
+	btCollisionObject* body0,
+	btCollisionObject* body1,
+	btSimplexSolverInterface* simplexSolver,
+	btConvexPenetrationDepthSolver* pdSolver,
+	int numPerturbationIterations,
+	int minimumPointsPerturbationThreshold)
+	: btConvexConvexAlgorithm(
+		mf, ci, body0, body1,
+		simplexSolver, pdSolver,
+		numPerturbationIterations, minimumPointsPerturbationThreshold)
+{
+}
+
+// Empty: this class holds no resources beyond what the base class manages.
+CustomConvexConvexPairCollision::~CustomConvexConvexPairCollision()
+{
+}
+
+
+#include <Windows.h>
+
+// Adds `value` to the integer stored at `ptr` through the Win32
+// InterlockedExchangeAdd call and returns that call's result converted to T.
+// The const qualifier on `ptr` is cast away: the pointee IS modified.
+template<typename T>
+T atomAdd(const T* ptr, int value)
+{
+	LONG* target = (LONG*)ptr;
+	return (T)InterlockedExchangeAdd(target, value);
+}
+
+
+
+// Helper macros written as plain serial loops.
+// PARALLEL_SUM: accumulates v[1..n-1] into v[0].
+#define PARALLEL_SUM(v, n) for(int j=1; j<n; j++) v[0] += v[j];
+// PARALLEL_DO: executes `execution` n times with the loop index `ie` in scope.
+#define PARALLEL_DO(execution, n) for(int ie=0; ie<n; ie++){execution;}
+// REDUCE_MAX / REDUCE_MIN: leave in v[0] the element of v[0..n-1] whose .y is
+// largest / smallest. The arguments are not parenthesized, so pass simple names.
+#define REDUCE_MAX(v, n) {int i=0;\
+	for(int offset=0; offset<n; offset++) v[i] = (v[i].y > v[i+offset].y)? v[i]: v[i+offset]; }
+#define REDUCE_MIN(v, n) {int i=0;\
+	for(int offset=0; offset<n; offset++) v[i] = (v[i].y < v[i+offset].y)? v[i]: v[i+offset]; }
+
+// Picks up to four representative points out of a set of contact candidates.
+//   p          : candidate points; only the first min(nPoints, 64) are considered.
+//   nPoints    : number of candidates in p.
+//   nearNormal : direction used to build the two in-plane sampling axes.
+//   centerOut  : receives the average of the considered candidates.
+//   contactIdx : receives the selected indices into p.
+// Returns 0 when there are no points, nPoints when fewer than four points are
+// given (indices 0..nPoints-1), and 4 otherwise (indices sorted ascending;
+// nothing here removes duplicates).
+int extractManifold(const float4* p, int nPoints, float4& nearNormal, float4& centerOut, 
+					 int contactIdx[4])
+{
+	if( nPoints == 0 ) return 0;
+
+	// The scratch arrays below hold 64 entries, so clamp the working set.
+	nPoints = min2( nPoints, 64 );
+
+	float4 center = make_float4(0.f);
+	{
+		// Sum a private copy of the points into v[0], then average.
+		float4 v[64];
+		memcpy( v, p, nPoints*sizeof(float4) );
+		PARALLEL_SUM( v, nPoints );
+		center = v[0]/(float)nPoints;
+	}
+
+	centerOut = center;
+
+	{	//	sample 4 directions
+		if( nPoints < 4 )
+		{
+			for(int i=0; i<nPoints; i++) contactIdx[i] = i;
+			return nPoints;
+		}
+
+		// Two axes perpendicular to nearNormal, derived from the first point's offset.
+		float4 aVector = p[0] - center;
+		float4 u = cross3( nearNormal, aVector );
+		float4 v = cross3( nearNormal, u );
+		u = normalize3( u );
+		v = normalize3( v );
+
+		int idx[4];
+
+		float2 max00 = make_float2(0,FLT_MAX);
+		{
+			float4 dir0 = u;
+			float4 dir1 = -u;
+			float4 dir2 = v;
+			float4 dir3 = -v;
+
+			//	idx, distance
+			{
+				{
+					int4 a[64];
+					for(int ie = 0; ie<nPoints; ie++ )
+					{
+						float4 f;
+						float4 r = p[ie]-center;
+						f.x = dot3F4( dir0, r );
+						f.y = dot3F4( dir1, r );
+						f.z = dot3F4( dir2, r );
+						f.w = dot3F4( dir3, r );
+
+						// Build a sort key per direction: the float's bit pattern with its
+						// low 8 bits replaced by the point index, so the winning key also
+						// identifies the point.
+						// NOTE(review): keys are compared as the int4 element type; if that
+						// is signed, negative projections do not order like their float
+						// values -- confirm this is intended.
+						a[ie].x = ((*(u32*)&f.x) & 0xffffff00);
+						a[ie].x |= (0xff & ie);
+
+						a[ie].y = ((*(u32*)&f.y) & 0xffffff00);
+						a[ie].y |= (0xff & ie);
+
+						a[ie].z = ((*(u32*)&f.z) & 0xffffff00);
+						a[ie].z |= (0xff & ie);
+
+						a[ie].w = ((*(u32*)&f.w) & 0xffffff00);
+						a[ie].w |= (0xff & ie);
+					}
+
+					// Component-wise maximum of all keys, accumulated in a[0].
+					for(int ie=0; ie<nPoints; ie++)
+					{
+						a[0].x = (a[0].x > a[ie].x )? a[0].x: a[ie].x;
+						a[0].y = (a[0].y > a[ie].y )? a[0].y: a[ie].y;
+						a[0].z = (a[0].z > a[ie].z )? a[0].z: a[ie].z;
+						a[0].w = (a[0].w > a[ie].w )? a[0].w: a[ie].w;
+					}
+
+					// The low byte of each winning key is the point index.
+					idx[0] = (int)a[0].x & 0xff;
+					idx[1] = (int)a[0].y & 0xff;
+					idx[2] = (int)a[0].z & 0xff;
+					idx[3] = (int)a[0].w & 0xff;
+				}
+			}
+
+			{
+				// (index, w) of the point with the smallest w. The result is only
+				// consumed by the commented-out code further down.
+				float2 h[64];
+				PARALLEL_DO( h[ie] = make_float2((float)ie, p[ie].w), nPoints );
+				REDUCE_MIN( h, nPoints );
+				max00 = h[0];
+			}
+		}
+
+		contactIdx[0] = idx[0];
+		contactIdx[1] = idx[1];
+		contactIdx[2] = idx[2];
+		contactIdx[3] = idx[3];
+
+//		if( max00.y < 0.0f )
+//			contactIdx[0] = (int)max00.x;
+
+		std::sort( contactIdx, contactIdx+4 );
+
+		return 4;
+	}
+}
+
+// Remove the file-local helper macros so they do not leak into the code below.
+#undef PARALLEL_SUM
+#undef PARALLEL_DO
+#undef REDUCE_MAX
+#undef REDUCE_MIN
+
+// Tests the sample points of shapeB against shapeA and, when at least one sample
+// lies closer than collisionMargin, claims a slot in contactsOut for one
+// ContactPoint4 holding up to four points.
+//   bodyApos/bodyAquat, bodyBpos/bodyBquat : poses of the two bodies.
+//   contactsOut, contactCapacity           : output array and its capacity.
+//   numContacts                            : incremented by one (through atomAdd)
+//                                            whenever any sample hit.
+//   collisionMargin                        : distance threshold for a hit; also
+//                                            subtracted from the stored depth.
+// Returns the point count reported by extractManifold (clamped to 4 only when
+// the contact is actually written), or 0 when nothing hit.
+int collideStraight(const ConvexHeightField* shapeA,const ConvexHeightField* shapeB,
+		const float4& bodyApos, Quaternion& bodyAquat,const float4& bodyBpos,const Quaternion& bodyBquat,
+		ContactPoint4* contactsOut, int& numContacts, int contactCapacity,
+		float collisionMargin )
+{
+//	Stopwatch sw;
+
+	Transform trA;
+	trA = trSetTransform(bodyApos,bodyAquat);
+	Transform trB;
+	trB = trSetTransform(bodyBpos, bodyBquat);
+	
+	// B2A = inverse(trA) * trB, applied below to B's sample points and face boxes.
+	Transform B2A;
+	{
+		Transform invTrA = trInvert( trA );
+		B2A = trMul( invTrA, trB );
+	}
+
+	int nContacts = 0;
+	{	// testB against A
+		// Room for every sample point of all six faces.
+		float4 p[ConvexHeightField::HEIGHT_RES*ConvexHeightField::HEIGHT_RES*6];
+		int nHits = 0;
+
+		const float4* pInB = shapeB->getSamplePoints();
+
+		// Offset from B to A rotated by the inverse of B's orientation; samples
+		// with a negative dot product against it are skipped below. For a height
+		// field shapeA it is zeroed, so no sample is skipped by that test.
+		float4 baInB = qtInvRotate( bodyBquat, bodyApos - bodyBpos );
+		if( shapeA->m_type == CollisionShape::SHAPE_HEIGHT_FIELD ) 
+			baInB = make_float4(0,0,0,0);
+
+//		sw.start();
+		for(int iface=0; iface<6; iface++)
+		{
+			// Skip the whole face when its transformed box misses A's box.
+			Aabb aabb = shapeB->m_faceAabbs[iface];
+
+			aabb.transform( B2A.m_translation, B2A.m_rotation );
+
+			if( !shapeA->m_aabb.overlaps( aabb ) ) continue;
+			
+			for(int ip=0; ip<ConvexHeightField::HEIGHT_RES*ConvexHeightField::HEIGHT_RES; ip++)
+			{
+				int i = iface*ConvexHeightField::HEIGHT_RES*ConvexHeightField::HEIGHT_RES+ip;
+
+				if( dot3F4( baInB, pInB[i] ) < 0.f ) continue;
+
+				float4 pInA = trMul1( B2A, pInB[i] );
+
+				if( shapeA->m_aabb.overlaps( pInA ) )
+				{
+//					Stopwatch sw1;
+//					sw1.start();
+					float dist = shapeA->queryDistance( pInA );
+//					sw1.stop();
+//					m_times[TIME_SAMPLE] += sw1.getMs();
+
+					if( dist < collisionMargin )
+					{
+						// Record the hit in A's frame with the distance in .w.
+						p[nHits] = make_float4(pInA.x, pInA.y, pInA.z, dist);
+						nHits++;
+					}
+				}
+			}
+		}
+//		sw.stop();
+//		m_times[TIME_TEST] += sw.getMs();
+
+//		sw.start();
+		if( nHits )
+		{
+			// Offset from A to B rotated by the inverse of A's orientation; passed
+			// to extractManifold as its reference direction.
+			float4 ab = bodyBpos - bodyApos;
+			ab = qtInvRotate( bodyAquat, ab );
+			if( shapeA->m_type == CollisionShape::SHAPE_HEIGHT_FIELD )
+			{
+				//todo.	sample normal from height field but just fake here
+				ab = make_float4(0,1,0,0);
+			}
+
+			int cIdx[4];
+			float4 center;
+			
+			nContacts = extractManifold( p, nHits, ab, center, cIdx );
+
+			// One shared normal for the contact, queried at the hit centroid.
+			float4 contactNormal;
+			{
+				shapeA->queryDistanceWithNormal( center, contactNormal );
+				contactNormal = normalize3( contactNormal );
+
+//				u32 cmp = u8vCompress( contactNormal );
+//				contactNormal = make_float4( u8vGetX(cmp), u8vGetY(cmp), u8vGetZ(cmp), 0 );
+			}
+
+			// NOTE(review): numContacts is incremented even when the slot is not
+			// written, and the `writeIdx+1 < contactCapacity` test never uses the
+			// last slot -- confirm both are intended.
+			int writeIdx = atomAdd( &numContacts, 1 );
+			if( writeIdx+1 < contactCapacity )
+			{
+				ContactPoint4& c = contactsOut[writeIdx];
+				nContacts = min2( nContacts, 4 );
+				for(int i=0; i<nContacts; i++)
+				{
+					// Position is transformed by A's pose; .w stores the distance minus
+					// the margin, floored at -2*collisionMargin.
+					c.m_worldPos[i] = transform( p[cIdx[i]], bodyApos, bodyAquat );
+					c.m_worldPos[i].w = max2( p[cIdx[i]].w - collisionMargin, -2*collisionMargin );
+				}
+				c.m_worldNormal = normalize3( qtRotate( bodyAquat, contactNormal ) );
+				c.m_restituitionCoeff = 0.f;
+				c.m_frictionCoeff = 0.7f;
+				//c.m_bodyAPtr = (void*)bodyAIdx;
+				//c.m_bodyBPtr = (void*)bodyBIdx;
+				c.getNPoints() = nContacts;
+			}
+		}
+//		sw.stop();
+//		m_times[TIME_MANIFOLD] += sw.getMs();
+	}
+
+	return nContacts;
+}
+
+
+// Obtains a manifold for the body pair from the dispatcher and flags it as
+// owned by this algorithm. dispatchInfo is accepted but not used.
+void	CustomConvexConvexPairCollision::createManifoldPtr(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo)
+{
+	btPersistentManifold* freshManifold = m_dispatcher->getNewManifold(body0,body1);
+	m_manifoldPtr = freshManifold;
+	m_ownManifold = true;
+}
+
+	
+// Narrow-phase collision for two bodies whose shapes are CustomConvexShape.
+// Runs collideStraight in both directions (1 sampled against 0, then 0 sampled
+// against 1) and copies every penetrating point (depth < 0) into resultOut.
+// The `#else` branch keeps the stock btConvexConvexAlgorithm path for reference.
+void CustomConvexConvexPairCollision::processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut)
+{
+#if 1
+	// Create the persistent manifold on first use.
+	if (m_manifoldPtr == 0)
+	{
+		//swapped?
+		m_manifoldPtr = m_dispatcher->getNewManifold(body0,body1);
+		m_ownManifold = true;
+	}
+	resultOut->setPersistentManifold(m_manifoldPtr);
+
+	// Unchecked casts: both collision shapes are assumed to be CustomConvexShape.
+	CustomConvexShape* shapeOf0 = (CustomConvexShape*)body0->getCollisionShape();
+	CustomConvexShape* shapeOf1 = (CustomConvexShape*)body1->getCollisionShape();
+
+	const btTransform& worldA = body0->getWorldTransform();
+	const btTransform& worldB = body1->getWorldTransform();
+
+	const btVector3& originA = worldA.getOrigin();
+	const btVector3& originB = worldB.getOrigin();
+	btQuaternion rotationA = worldA.getRotation();
+	btQuaternion rotationB = worldB.getRotation();
+
+	// Convert the Bullet poses into the float4 / Quaternion types used by collideStraight.
+	float4 bodyApos;
+	bodyApos.x = originA.getX();
+	bodyApos.y = originA.getY();
+	bodyApos.z = originA.getZ();
+	bodyApos.w = 0.f;
+
+	float4 bodyBpos;
+	bodyBpos.x = originB.getX();
+	bodyBpos.y = originB.getY();
+	bodyBpos.z = originB.getZ();
+	bodyBpos.w = 0.f;
+
+	Quaternion bodyAquat;
+	bodyAquat.x = rotationA.getX();
+	bodyAquat.y = rotationA.getY();
+	bodyAquat.z = rotationA.getZ();
+	bodyAquat.w = rotationA.getW();
+
+	Quaternion bodyBquat;
+	bodyBquat.x = rotationB.getX();
+	bodyBquat.y = rotationB.getY();
+	bodyBquat.z = rotationB.getZ();
+	bodyBquat.w = rotationB.getW();
+
+
+#define CAPACITY_CONTACTS 4
+
+	ContactPoint4 contactsOut[CAPACITY_CONTACTS];
+	int freeContactIndex = 0;
+	int contactCapacity = CAPACITY_CONTACTS;
+	float collisionMargin = 0.001f;
+
+	m_manifoldPtr->refreshContactPoints(body0->getWorldTransform(),body1->getWorldTransform());
+
+	// First pass appends at most one contact, second pass at most one more.
+	collideStraight(shapeOf0->m_ConvexHeightField,shapeOf1->m_ConvexHeightField,
+		bodyApos, bodyAquat,bodyBpos,bodyBquat,
+		contactsOut, freeContactIndex, contactCapacity,
+		collisionMargin );
+	collideStraight(shapeOf1->m_ConvexHeightField,shapeOf0->m_ConvexHeightField,
+		bodyBpos, bodyBquat,bodyApos,bodyAquat,
+		contactsOut, freeContactIndex, contactCapacity,
+		collisionMargin );
+
+	//copy points into manifold
+	//refresh manifold
+
+	btAssert(freeContactIndex<3);
+
+	// Centre-to-centre offset; used to orient every contact normal.
+	const btVector3 centerDelta = worldB.getOrigin() - worldA.getOrigin();
+
+	for (int slot=0;slot<freeContactIndex;slot++)
+	{
+		ContactPoint4& contact = contactsOut[slot];
+		const int pointCount = contact.getNPoints();
+//		printf("numPoints = %d\n",numPoints);
+
+		for (int k=0;k<pointCount;k++)
+		{
+			const btScalar depth = contact.m_worldPos[k].w;
+			// Only penetrating points are reported.
+			if (!(depth<0))
+				continue;
+
+			btVector3 normalOnBInWorld(
+				contact.m_worldNormal.x,
+				contact.m_worldNormal.y,
+				contact.m_worldNormal.z);
+			btVector3 pointInWorldOnB(
+				contact.m_worldPos[k].x,
+				contact.m_worldPos[k].y,
+				contact.m_worldPos[k].z);
+
+			// Flip the normal when it points along the A-to-B offset.
+			if((centerDelta.dot(normalOnBInWorld))>0.0f)
+			{
+				normalOnBInWorld= -normalOnBInWorld;
+			}
+			normalOnBInWorld.normalize();
+
+			if (slot)
+			{
+				resultOut->addContactPoint(normalOnBInWorld, pointInWorldOnB, depth);
+			}
+			else
+			{
+				// Points from the first slot are shifted along the normal by the depth.
+				resultOut->addContactPoint(normalOnBInWorld, pointInWorldOnB-normalOnBInWorld*depth, depth);
+			}
+		}
+	}
+#else
+	btConvexConvexAlgorithm::processCollision(body0,body1,dispatchInfo,resultOut);
+#endif
+}
+
+
+
+// Passes both solvers straight through to btConvexConvexAlgorithm::CreateFunc.
+CustomConvexConvexPairCollision::CreateFunc::CreateFunc(
+	btSimplexSolverInterface* simplexSolver,
+	btConvexPenetrationDepthSolver* pdSolver)
+	: btConvexConvexAlgorithm::CreateFunc(simplexSolver, pdSolver)
+{
+}
+		
+// Empty: the factory adds no members of its own.
+CustomConvexConvexPairCollision::CreateFunc::~CreateFunc()
+{
+}
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexPairCollision.h
@@ -0,0 +1,56 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef CUSTOM_CONVEX_CONVEX_PAIR_COLLISION_H
+#define CUSTOM_CONVEX_CONVEX_PAIR_COLLISION_H
+
+
+#include "BulletCollision/CollisionDispatch/btConvexConvexAlgorithm.h"
+
+// Convex-convex collision algorithm derived from btConvexConvexAlgorithm that
+// supplies its own processCollision and exposes the manifold pointer.
+class CustomConvexConvexPairCollision : public btConvexConvexAlgorithm
+{
+	public:
+
+	// All arguments are handed to the btConvexConvexAlgorithm constructor.
+	CustomConvexConvexPairCollision(btPersistentManifold* mf,const btCollisionAlgorithmConstructionInfo& ci,btCollisionObject* body0,btCollisionObject* body1, btSimplexSolverInterface* simplexSolver, btConvexPenetrationDepthSolver* pdSolver, int numPerturbationIterations, int minimumPointsPerturbationThreshold);
+	virtual ~CustomConvexConvexPairCollision();
+
+	virtual void processCollision (btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo,btManifoldResult* resultOut);
+
+	// Returns the current manifold pointer as stored (may be null).
+	btPersistentManifold*	getManifoldPtr()
+	{
+		return m_manifoldPtr;
+	}
+
+	// Creates the manifold for the given pair; defined in the .cpp file.
+	void	createManifoldPtr(btCollisionObject* body0,btCollisionObject* body1,const btDispatcherInfo& dispatchInfo);
+
+	// Factory that builds CustomConvexConvexPairCollision instances.
+	struct CreateFunc :public 	btConvexConvexAlgorithm::CreateFunc
+	{
+
+		CreateFunc(btSimplexSolverInterface*			simplexSolver, btConvexPenetrationDepthSolver* pdSolver);
+		
+		virtual ~CreateFunc();
+
+		virtual	btCollisionAlgorithm* CreateCollisionAlgorithm(btCollisionAlgorithmConstructionInfo& ci, btCollisionObject* body0,btCollisionObject* body1)
+		{
+			// Storage comes from the dispatcher; the object is constructed in place in it.
+			void* mem = ci.m_dispatcher1->allocateCollisionAlgorithm(sizeof(CustomConvexConvexPairCollision));
+			return new(mem) CustomConvexConvexPairCollision(ci.m_manifold,ci,body0,body1,m_simplexSolver,m_pdSolver,m_numPerturbationIterations,m_minimumPointsPerturbationThreshold);
+		}
+	};
+	
+
+};
+
+
+#endif //CUSTOM_CONVEX_CONVEX_PAIR_COLLISION_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.cpp
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "CustomConvexShape.h"
+#include "ConvexHeightFieldShape.h"
+#include "BulletCollision/CollisionShapes/btConvexPolyhedron.h"
+
+
+// Builds a convex hull from `points`, tags it with the custom shape type,
+// generates the polyhedral face data and creates the companion
+// ConvexHeightField from the face plane equations.
+CustomConvexShape::CustomConvexShape(const btScalar* points,int numPoints, int stride)
+	: btConvexHullShape(points,numPoints,stride)
+	, m_acceleratedCompanionShapeIndex(-1)
+{
+	m_shapeType = CUSTOM_POLYHEDRAL_SHAPE_TYPE;
+
+	initializePolyhedralFeatures();
+
+	// Copy each face plane (four coefficients) into a float4.
+	const int faceCount = m_polyhedron->m_faces.size();
+	float4* planeEquations = new float4[faceCount];
+	for (int f=0; f<faceCount; ++f)
+	{
+		float4& plane = planeEquations[f];
+		plane.x = m_polyhedron->m_faces[f].m_plane[0];
+		plane.y = m_polyhedron->m_faces[f].m_plane[1];
+		plane.z = m_polyhedron->m_faces[f].m_plane[2];
+		plane.w = m_polyhedron->m_faces[f].m_plane[3];
+	}
+
+	// NOTE(review): planeEquations is never deleted in this file; confirm whether
+	// ConvexHeightField takes ownership of the array or this is a leak.
+	m_ConvexHeightField = new ConvexHeightField(planeEquations,faceCount);
+}
+
+// Releases the companion height field allocated by the constructor.
+CustomConvexShape::~CustomConvexShape()
+{
+	delete this->m_ConvexHeightField;
+}
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/CustomConvexShape.h
@@ -0,0 +1,35 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef CUSTOM_CONVEX_SHAPE_H
+#define CUSTOM_CONVEX_SHAPE_H
+
+#include "BulletCollision/CollisionShapes/btConvexHullShape.h"
+
+// Convex hull shape that carries a companion ConvexHeightField.
+// NOTE(review): no copy constructor or assignment operator is declared while
+// the destructor deletes m_ConvexHeightField, so copying an instance would
+// leave two objects deleting the same pointer -- confirm instances are never copied.
+class CustomConvexShape  : public btConvexHullShape
+{
+	public:
+		
+		// Heap-allocated companion shape; deleted in the destructor.
+		class ConvexHeightField* m_ConvexHeightField;
+
+		// Set to -1 by the constructor; presumably an index into an accelerated shape table -- TODO confirm.
+		int m_acceleratedCompanionShapeIndex;
+
+		// Arguments are forwarded to btConvexHullShape(points, numPoints, stride).
+		CustomConvexShape(const btScalar* points,int numPoints,int stride);
+		virtual ~CustomConvexShape();
+		
+};
+
+#endif //CUSTOM_CONVEX_SHAPE_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlAabb.h
@@ -0,0 +1,230 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifndef AABB_H
+#define AABB_H
+
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlQuaternion.h"
+
+// Shape type tags. Numbering starts at 2, so the values are 2, 3 and 4.
+// Note that the last enumerator has no ADL_ prefix, unlike the first two.
+enum AdlCollisionShapeTypes
+{
+	ADL_SHAPE_SPHERE=2,
+	ADL_SHAPE_HEIGHT_FIELD,
+	SHAPE_CONVEX_HEIGHT_FIELD,
+};
+
+// Axis-aligned bounding box stored as two float4 corners (m_max, m_min).
+// Only the x, y and z components take part in the per-axis tests; the member
+// functions are defined further down in this header.
+_MEM_CLASSALIGN16
+struct Aabb
+{
+	public:
+		_MEM_ALIGNED_ALLOCATOR16;
+
+		// Puts the box into an inverted state (max = -FLT_MAX, min = +FLT_MAX).
+		__inline
+		void setEmpty();
+		// Grows the box so that it also encloses `aabb`.
+		__inline
+		void includeVolume( const Aabb& aabb );
+		// Grows the box so that it also encloses `p`.
+		__inline
+		void includePoint( const float4& p );
+		// True when `p` lies inside the box, boundaries included.
+		__inline
+		bool overlaps( const float4& p ) const;
+		// True when this box and `aabb` are not separated on any axis.
+		__inline
+		bool overlaps( const Aabb& aabb ) const;
+		// Midpoint of the two corners.
+		__inline
+		float4 center() const;
+		// Index (0, 1 or 2) of the axis with the largest extent.
+		__inline
+		int getMajorAxis() const;
+		// m_max - m_min.
+		__inline
+		float4 getExtent() const;
+		// Adds `r` to m_max and subtracts it from m_min.
+		__inline
+		void expandBy( const float4& r );
+
+		// True when `a` and `b` are not separated on any axis.
+		__inline
+		static bool overlaps( const Aabb& a, const Aabb& b );
+
+		// Box test for a ray starting at *from; *invRay is multiplied
+		// component-wise (presumably the reciprocal of to - from -- TODO confirm).
+		__inline
+		bool intersect(const float4* from, const float4* to, const float4* invRay) const;
+
+		// Replaces the box with the bounds of the rotated, then translated box.
+		__inline
+		void transform(const float4& translation, const Quaternion& quat);
+
+		__inline
+		void transform(const float4& translation, const Matrix3x3& rot);
+
+	public:
+		float4 m_max;
+		float4 m_min;
+};
+
+// Resets the box to an inverted state so that any later include* call
+// replaces both corners: max becomes -FLT_MAX and min becomes +FLT_MAX.
+void Aabb::setEmpty()
+{
+	const float lowest = -FLT_MAX;
+	const float highest = FLT_MAX;
+
+	m_max = make_float4( lowest );
+	m_min = make_float4( highest );
+}
+
+// Grows this box, axis by axis, until it also encloses `aabb`.
+void Aabb::includeVolume(const Aabb& aabb)
+{
+	const float4& otherMax = aabb.m_max;
+	const float4& otherMin = aabb.m_min;
+
+	// Upper corner: keep the larger value on each axis.
+	m_max.x = max2( m_max.x, otherMax.x );
+	m_max.y = max2( m_max.y, otherMax.y );
+	m_max.z = max2( m_max.z, otherMax.z );
+
+	// Lower corner: keep the smaller value on each axis.
+	m_min.x = min2( m_min.x, otherMin.x );
+	m_min.y = min2( m_min.y, otherMin.y );
+	m_min.z = min2( m_min.z, otherMin.z );
+}
+
+// Grows this box, axis by axis, until it also encloses the point `p`.
+void Aabb::includePoint( const float4& p )
+{
+	// Upper corner: keep the larger value on each axis.
+	m_max.x = max2( m_max.x, p.x );
+	m_max.y = max2( m_max.y, p.y );
+	m_max.z = max2( m_max.z, p.z );
+
+	// Lower corner: keep the smaller value on each axis.
+	m_min.x = min2( m_min.x, p.x );
+	m_min.y = min2( m_min.y, p.y );
+	m_min.z = min2( m_min.z, p.z );
+}
+
+// Point-in-box test; a point lying exactly on a face counts as inside.
+bool Aabb::overlaps( const float4& p ) const
+{
+	float4 toMax = m_max-p;
+	float4 fromMin = p-m_min;
+
+	const bool notAboveMax = (toMax.x >= 0) && (toMax.y >= 0) && (toMax.z >= 0);
+	const bool notBelowMin = (fromMin.x >= 0) && (fromMin.y >= 0) && (fromMin.z >= 0);
+
+	return notAboveMax && notBelowMin;
+}
+
+// Box-box overlap test for this box against `in`; delegates to the static
+// two-argument overload, which holds the per-axis separation checks.
+bool Aabb::overlaps( const Aabb& in ) const
+{
+	const Aabb& self = *this;
+	return overlaps( self, in );
+}
+
+// Two boxes overlap unless they are separated along at least one axis.
+// Touching boxes (equal bounds) count as overlapping.
+bool Aabb::overlaps( const Aabb& a, const Aabb& b )
+{
+	const bool separatedX = ( a.m_max.x < b.m_min.x || a.m_min.x > b.m_max.x );
+	const bool separatedY = ( a.m_max.y < b.m_min.y || a.m_min.y > b.m_max.y );
+	const bool separatedZ = ( a.m_max.z < b.m_min.z || a.m_min.z > b.m_max.z );
+
+	return !( separatedX || separatedY || separatedZ );
+}
+
+// Returns the midpoint of the two corners.
+float4 Aabb::center() const
+{
+	float4 cornerSum = m_max+m_min;
+	return 0.5f*cornerSum;
+}
+
+// Returns the index (0 = x, 1 = y, 2 = z) of the axis with the largest extent.
+// On ties the lower index wins because the comparisons are strict.
+int Aabb::getMajorAxis() const
+{
+	float4 size = getExtent();
+
+	int widest = ( size.s[1] > size.s[0] )? 1: 0;
+	if( size.s[2] > size.s[widest] )
+	{
+		widest = 2;
+	}
+
+	return widest;
+}
+
+// Returns the full size of the box: upper corner minus lower corner.
+float4 Aabb::getExtent() const
+{
+	float4 size = m_max-m_min;
+	return size;
+}
+
+// Enlarges the box by `r`: the upper corner moves by +r, the lower corner by -r.
+// The upper corner is updated first; the order matters if `r` refers to one of
+// this box's own corners.
+void Aabb::expandBy( const float4& r )
+{
+	this->m_max += r;
+	this->m_min -= r;
+}
+
+// Ray-versus-box test. Each corner offset from *from is scaled component-wise by
+// *invRay, giving a pair of parameters per axis; the box is hit when the
+// smallest "far" value (capped at 1) is not below the largest "near" value
+// (floored at 0). `to` is not read here -- presumably *invRay already encodes
+// 1/(to - from); TODO confirm with callers.
+bool Aabb::intersect(const float4* from, const float4* to, const float4* invRay) const
+{
+	float4 tAtMax;
+	tAtMax = (m_max - *from);
+	tAtMax *= *invRay;
+
+	float4 tAtMin;
+	tAtMin = (m_min - *from);
+	tAtMin *= *invRay;
+
+	// Per-axis far / near parameters.
+	float4 tExit;
+	tExit = max2(tAtMax, tAtMin);
+	float4 tEnter;
+	tEnter = min2(tAtMax, tAtMin);
+
+	float exitX = tExit.x, exitY = tExit.y, exitZ = tExit.z;
+	float enterX = tEnter.x, enterY = tEnter.y, enterZ = tEnter.z;
+
+	float earliestExit = min2(exitX, min2(exitY, exitZ));
+	float latestEnter = max2(enterX, max2(enterY, enterZ));
+
+	// Restrict the interval to the [0, 1] parameter range.
+	earliestExit = min2(1.0f, earliestExit );
+	latestEnter = max2(0.0f, latestEnter);
+
+	return (earliestExit >= latestEnter);
+}
+
+// Replaces this box with the axis-aligned bounds of itself after applying the
+// rotation matrix `m` and then adding `translation`.
+// For every matrix row, the component-wise products with the old min and max
+// corners are formed; on each component the smaller product contributes to the
+// new minimum and the larger one to the new maximum.
+void Aabb::transform(const float4& translation, const Matrix3x3& m)
+{
+	// Products are taken before the corners are overwritten below.
+	float4 lo[] = { m.m_row[0]*m_min, m.m_row[1]*m_min, m.m_row[2]*m_min };
+	float4 hi[] = { m.m_row[0]*m_max, m.m_row[1]*m_max, m.m_row[2]*m_max };
+
+	m_max = m_min = translation;
+
+	// Row 0 -> x
+	m_min.x += min2( lo[0].x, hi[0].x ) + min2( lo[0].y, hi[0].y ) + min2( lo[0].z, hi[0].z );
+	m_max.x += max2( lo[0].x, hi[0].x ) + max2( lo[0].y, hi[0].y ) + max2( lo[0].z, hi[0].z );
+
+	// Row 1 -> y
+	m_min.y += min2( lo[1].x, hi[1].x ) + min2( lo[1].y, hi[1].y ) + min2( lo[1].z, hi[1].z );
+	m_max.y += max2( lo[1].x, hi[1].x ) + max2( lo[1].y, hi[1].y ) + max2( lo[1].z, hi[1].z );
+
+	// Row 2 -> z
+	m_min.z += min2( lo[2].x, hi[2].x ) + min2( lo[2].y, hi[2].y ) + min2( lo[2].z, hi[2].z );
+	m_max.z += max2( lo[2].x, hi[2].x ) + max2( lo[2].y, hi[2].y ) + max2( lo[2].z, hi[2].z );
+}
+
+// Quaternion overload: converts the rotation to a matrix and reuses the
+// Matrix3x3 overload.
+void Aabb::transform(const float4& translation, const Quaternion& quat)
+{
+	Matrix3x3 rotation = qtGetRotationMatrix( quat );
+	transform( translation, rotation );
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlArray.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlArray.h
@@ -0,0 +1,212 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ARRAY_H
+#define ARRAY_H
+
+#include <string.h>
+#include <malloc.h>
+#include <Common/Base/Error.h>
+#include <new.h>
+
+
+// Growable array backed by 16-byte aligned storage (_aligned_malloc).
+// Growth paths copy elements with memcpy and the destructor frees the storage
+// without running element destructors, so T is presumably meant to be a plain
+// data type -- TODO confirm.
+// NOTE(review): the copy constructor is private, but no assignment operator is
+// declared, so assigning one Array to another would share m_data between both.
+template <class T>
+class Array
+{
+	public:
+		// Empty array with DEFAULT_SIZE slots of capacity.
+		__inline
+		Array();
+		// Array whose size and capacity are both `size`.
+		__inline
+		Array(int size);
+		__inline
+		~Array();
+		__inline
+		T& operator[] (int idx);
+		__inline
+		const T& operator[] (int idx) const;
+		// Appends a copy of `elem`, growing by INCREASE_SIZE slots when full.
+		__inline
+		void pushBack(const T& elem);
+		// Drops the last element (size only; capacity is kept).
+		__inline
+		void popBack();
+		// Sets the size to zero (capacity is kept).
+		__inline
+		void clear();
+		// Sets the size, reallocating when `size` exceeds the capacity.
+		__inline
+		void setSize(int size);
+		__inline
+		int getSize() const;
+		// Pointer to the first element of the storage.
+		__inline
+		T* begin();
+		__inline
+		const T* begin() const;
+		// Index of the first element equal to `data`, or -1.
+		__inline
+		int indexOf(const T& data) const;
+		// Removes element idx by overwriting it with the last element (order is not kept).
+		__inline
+		void removeAt(int idx);
+		// Grows the size by one and returns the new last element.
+		__inline
+		T& expandOne();
+
+	private:
+		// Private with an empty body: copy construction is not available to users.
+		Array(const Array& a){}
+
+	private:
+		enum
+		{
+			DEFAULT_SIZE = 128,
+			INCREASE_SIZE = 128,
+		};
+
+		T* m_data;
+		int m_size;
+		int m_capacity;
+};
+
+// Creates an empty array with DEFAULT_SIZE slots of aligned storage, each slot
+// default-constructed in place.
+template<class T>
+Array<T>::Array()
+{
+	m_size = 0;
+	m_capacity = DEFAULT_SIZE;
+	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&m_data[slot])T;
+		slot++;
+	}
+}
+
+// Creates an array whose size and capacity both equal `size`; every slot is
+// default-constructed in place in the aligned storage.
+template<class T>
+Array<T>::Array(int size)
+{
+	m_size = size;
+	m_capacity = size;
+	m_data = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+
+	int slot = 0;
+	while( slot < m_capacity )
+	{
+		new(&m_data[slot])T;
+		slot++;
+	}
+}
+
+// Frees the aligned storage. Element destructors are not invoked.
+template<class T>
+Array<T>::~Array()
+{
+	if( !m_data ) return;
+
+	_aligned_free( m_data );
+	m_data = NULL;
+}
+
+// Mutable element access. The assertion rejects any index outside
+// [0, m_size), including negative ones.
+template<class T>
+T& Array<T>::operator[](int idx)
+{
+	CLASSERT(idx>=0 && idx<m_size);
+	return m_data[idx];
+}
+
+// Read-only element access. The assertion rejects any index outside
+// [0, m_size), including negative ones.
+template<class T>
+const T& Array<T>::operator[](int idx) const
+{
+	CLASSERT(idx>=0 && idx<m_size);
+	return m_data[idx];
+}
+
+// Appends a copy of `elem`. When the array is full the storage grows by
+// INCREASE_SIZE slots: the new block is default-constructed (as in the
+// constructors and setSize), the old contents are copied over with memcpy,
+// and `elem` is stored BEFORE the old block is released, so passing a
+// reference to one of this array's own elements stays valid.
+template<class T>
+void Array<T>::pushBack(const T& elem)
+{
+	if( m_size == m_capacity )
+	{
+		const int oldCap = m_capacity;
+		const int newCap = oldCap + INCREASE_SIZE;
+
+		T* s = (T*)_aligned_malloc(sizeof(T)*newCap, 16);
+		for(int i=0; i<newCap; i++) new(&s[i])T;
+		memcpy( s, m_data, sizeof(T)*oldCap );
+
+		// `elem` may live inside the old block, so copy it while that block is still alive.
+		s[ m_size ] = elem;
+
+		_aligned_free( m_data );
+		m_data = s;
+		m_capacity = newCap;
+		m_size++;
+		return;
+	}
+	m_data[ m_size++ ] = elem;
+}
+
+// Removes the last element by shrinking the size; asserts the array is not empty.
+template<class T>
+void Array<T>::popBack()
+{
+	CLASSERT( m_size>0 );
+	--m_size;
+}
+
+// Empties the array by resetting the size; storage and capacity are untouched.
+template<class T>
+void Array<T>::clear()
+{
+	this->m_size = 0;
+}
+
+// Sets the element count. When `size` exceeds the capacity, a new block of
+// exactly `size` default-constructed slots is allocated, the old block's
+// contents are copied over with memcpy and the old block is freed.
+template<class T>
+void Array<T>::setSize(int size)
+{
+	const bool mustGrow = ( size > m_capacity );
+	if( mustGrow )
+	{
+		const int previousCapacity = m_capacity;
+		m_capacity = size;
+
+		T* grown = (T*)_aligned_malloc(sizeof(T)*m_capacity, 16);
+		for(int slot=0; slot<m_capacity; slot++) new(&grown[slot])T;
+		memcpy( grown, m_data, sizeof(T)*previousCapacity );
+
+		_aligned_free( m_data );
+		m_data = grown;
+	}
+	m_size = size;
+}
+
+// Returns the number of elements currently in use (not the capacity).
+template<class T>
+int Array<T>::getSize() const
+{
+	return this->m_size;
+}
+
+// Read-only pointer to the start of the storage.
+template<class T>
+const T* Array<T>::begin() const
+{
+	return this->m_data;
+}
+
+// Mutable pointer to the start of the storage.
+template<class T>
+T* Array<T>::begin()
+{
+	return this->m_data;
+}
+
+// Linear search: returns the index of the first element that compares equal
+// to `data` (using operator==), or -1 when there is none.
+template<class T>
+int Array<T>::indexOf(const T& data) const
+{
+	int found = -1;
+	for(int i=0; i<m_size && found<0; i++)
+	{
+		if( data == m_data[i] ) found = i;
+	}
+	return found;
+}
+
+// Removes element `idx` by overwriting it with the last element and shrinking
+// the size by one; element order is not preserved. The assertion rejects any
+// index outside [0, m_size), including negative ones.
+template<class T>
+void Array<T>::removeAt(int idx)
+{
+	CLASSERT(idx>=0 && idx<m_size);
+	m_data[idx] = m_data[--m_size];
+}
+
+// Grows the array by one element (through setSize) and returns a reference to
+// the newly exposed last slot.
+template<class T>
+T& Array<T>::expandOne()
+{
+	const int newLast = m_size;
+	setSize( newLast+1 );
+	return m_data[ newLast ];
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollideUtils.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollideUtils.h
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef COLLIDE_UTILS_H
+#define COLLIDE_UTILS_H
+
+#include "Stubs/AdlMath.h"
+
+
+class CollideUtils	// static triangle helpers: a point-vs-triangle test and a ray cast
+{
+	public:
+		template<bool FLIPSIGN>	// FLIPSIGN selects whether the reported normal is negated
+		static bool collide(const float4& a, const float4& b, const float4& c, const float4& p, float4& normalOut, float margin = 0.f);
+		// castRay: returns the ray parameter t of the hit with triangle (v0,v1,v2), or -1 on a miss.
+		__inline
+		static float castRay(const float4& v0, const float4& v1, const float4& v2,
+			 const float4& rayFrom, const float4& rayTo, float margin = 0.0f, float4* bCrdOut = NULL);
+
+};
+
+
+// Point-vs-triangle test for triangle (a,b,c) and point p. Returns true when the three
+// per-edge values computed below all fall on the same side of zero, relaxed by margin.
+// On success normalOut receives the normalized triangle normal with w set to the offset
+// of p from a along it, negated as a whole when FLIPSIGN is true; untouched on failure.
+template<bool FLIPSIGN>
+bool CollideUtils::collide(const float4& a, const float4& b, const float4& c, const float4& p, float4& normalOut, float margin)
+{
+	// Edge vectors of the triangle, walked in the order a->b->c->a.
+	const float4 edgeAB = b-a;
+	const float4 edgeBC = c-b;
+	const float4 edgeCA = a-c;
+
+	// Query point relative to each vertex.
+	const float4 fromA = p-a;
+	const float4 fromB = p-b;
+	const float4 fromC = p-c;
+
+	// Unnormalized triangle normal.
+	float4 n = cross3(edgeAB, -1.f*edgeCA);
+
+	// One value per edge: n dotted with (edge x vertex-to-point).
+	const float sideAB = dot3F4(n, cross3( edgeAB, fromA ));
+	const float sideBC = dot3F4(n, cross3( edgeBC, fromB ));
+	const float sideCA = dot3F4(n, cross3( edgeCA, fromC ));
+
+	// Accept when all three are below +margin, or all three are above -margin.
+	const bool allBelow = ( sideAB<margin && sideBC<margin && sideCA<margin );
+	const bool allAbove = ( sideAB>-margin && sideBC>-margin && sideCA>-margin );
+	if( !allBelow && !allAbove ) return false;
+
+	n = normalize3( n );
+	n.w = dot3F4(n,fromA);	// w = component of (p-a) along the normalized normal
+
+	normalOut = (FLIPSIGN)? -n : n;
+	return true;
+}
+
+__inline
+float CollideUtils::castRay(const float4& v0, const float4& v1, const float4& v2,
+			 const float4& rayFrom, const float4& rayTo, float margin, float4* bCrdOut)
+{
+	// Intersects the line through rayFrom/rayTo with triangle (v0,v1,v2). Returns the line
+	// parameter t (0 at rayFrom, 1 at rayTo; t itself is NOT range-checked) or -1 on a miss.
+	// The optional bCrdOut receives the barycentric weights (u,v,w) of v0,v1,v2 in x,y,z.
+	float t, v, w;
+	float4 ab; ab = v1 - v0;
+	float4 ac; ac = v2 - v0;
+	float4 qp; qp = rayFrom - rayTo;
+	float4 normal = cross3( ab, ac );
+	float d = dot3F4( qp, normal );
+	// d == 0: ray parallel to the triangle plane, or a degenerate triangle/ray. Dividing would
+	// yield inf/NaN, and NaN slips through every range test below, so report a miss instead.
+	if( d == 0.f ) return -1;
+	float odd = 1.f/d;
+	float4 ap; ap = rayFrom - v0;
+	t = dot3F4( ap, normal ) * odd;
+
+	float4 e = cross3( qp, ap );
+	v = dot3F4( ac, e ) * odd;
+	if( v < -margin || v > 1.f+margin ) return -1;
+	w = -dot3F4( ab, e ) * odd;
+	if( w < -margin || w > 1.f+margin ) return -1;
+
+	float u = 1.f-v-w;
+	if( u < -margin || u > 1.f+margin ) return -1;
+	if( bCrdOut )
+	{
+		bCrdOut->x = u;
+		bCrdOut->y = v;
+		bCrdOut->z = w;
+	}
+	return t;
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollisionShape.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlCollisionShape.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef COLLISION_SHAPE_H
+#define COLLISION_SHAPE_H
+
+#include "Stubs/AdlMath.h"
+#include "Stubs/AdlAabb.h"
+
+
+_MEM_CLASSALIGN16
+class CollisionShape	// abstract base class: a shape kind, a bounding box, a margin and two pure-virtual point queries
+{
+	public:
+		_MEM_ALIGNED_ALLOCATOR16;
+
+		enum Type
+		{
+			SHAPE_HEIGHT_FIELD,
+			SHAPE_CONVEX_HEIGHT_FIELD,
+			SHAPE_PLANE,
+			MAX_NUM_SHAPE_TYPES,	// equals the number of enumerators listed above it
+		};
+
+		// Stores the shape kind and the margin; m_aabb is default-constructed.
+		CollisionShape( Type type, float collisionMargin = 0.0025f ) : m_type( type ), m_collisionMargin( collisionMargin ) {}
+		virtual ~CollisionShape() {}
+		virtual float queryDistance(const float4& p) const = 0;	// implemented by subclasses; presumably the distance from p to the shape
+		virtual bool queryDistanceWithNormal(const float4& p, float4& normalOut) const = 0;	// implemented by subclasses; writes normalOut
+
+		Type m_type;
+		Aabb m_aabb;
+		float m_collisionMargin;
+};
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlConstraint4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlConstraint4.h
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_CONSTRAINT4_H
+#define ADL_CONSTRAINT4_H
+
+
+
+struct Constraint4	// solver constraint record; the 4-element arrays hold one entry per contact point (presumably matching Contact4)
+		{
+			_MEM_ALIGNED_ALLOCATOR16;
+
+			float4 m_linear;	// w lane doubles as the friction coefficient (see the accessors below)
+			float4 m_worldPos[4];
+			float4 m_center;	//	friction
+			float m_jacCoeffInv[4];
+			float m_b[4];
+			float m_appliedRambdaDt[4];
+
+			float m_fJacCoeffInv[2];	//	friction
+			float m_fAppliedRambdaDt[2];	//	friction
+
+			u32 m_bodyA;
+			u32 m_bodyB;
+
+			u32 m_batchIdx;
+			u32 m_paddings[1];	// explicit padding word
+
+			__inline
+			void setFrictionCoeff(float value) { m_linear.w = value; }	// stored in m_linear.w
+			__inline
+			float getFrictionCoeff() const { return m_linear.w; }	// read back from m_linear.w
+		};
+
+#endif //ADL_CONSTRAINT4_H
+		
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlContact4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlContact4.h
@@ -0,0 +1,102 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_CONTACT4_H
+#define ADL_CONTACT4_H
+
+#ifdef CL_PLATFORM_AMD
+#include "AdlConstraint4.h"
+#include "Adl/Adl.h"
+// SolverData: pointer to an adl::Buffer of Constraint4 when CL_PLATFORM_AMD is defined, an untyped pointer otherwise.
+typedef adl::Buffer<Constraint4>* SolverData;
+#else
+typedef void* SolverData;
+#endif
+
+typedef void* ShapeDataType;	// untyped handle; the concrete shape data type is not visible in this header
+
+
+struct Contact4	// contact record holding up to four points between two bodies
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+
+	float4 m_worldPos[4];	// per point: w holds the penetration (see getPenetration)
+	float4 m_worldNormal;	// w holds the point count (see getNPoints)
+	// Restitution and friction are stored quantized as 16-bit fixed point:
+	// 0x0000 maps to 0.0 and 0xffff to 1.0 (see the accessors below).
+	u16 m_restituitionCoeffCmp;
+	u16 m_frictionCoeffCmp;
+	int m_batchIdx;
+
+	u32 m_bodyAPtr;
+	u32 m_bodyBPtr;
+
+	//	todo. make it safer
+	int& getBatchIdx() { return m_batchIdx; }
+	float getRestituitionCoeff() const { return ((float)m_restituitionCoeffCmp/(float)0xffff); }
+	void setRestituitionCoeff( float c ) { ADLASSERT( c >= 0.f && c <= 1.f ); m_restituitionCoeffCmp = (u16)(c*0xffff); }
+	float getFrictionCoeff() const { return ((float)m_frictionCoeffCmp/(float)0xffff); }
+	void setFrictionCoeff( float c ) { ADLASSERT( c >= 0.f && c <= 1.f ); m_frictionCoeffCmp = (u16)(c*0xffff); }
+
+	float& getNPoints() { return m_worldNormal.w; }
+	float getNPoints() const { return m_worldNormal.w; }
+
+	float getPenetration(int idx) const { return m_worldPos[idx].w; }
+	// True only when both body fields are zero. Bitwise OR is used because a wrapping u32 sum can be 0 for non-zero operands (0x80000000 + 0x80000000).
+	bool isInvalid() const { return (m_bodyAPtr | m_bodyBPtr) == 0; }
+};
+
+struct ContactPoint4	// contact record with float coefficients and raw body pointers
+		{
+			float4 m_worldPos[4];	// per point: w holds the penetration (see getPenetration)
+			union
+			{
+				float4 m_worldNormal;
+
+				struct Data
+				{
+					int m_padding[3];
+					float m_nPoints;	//	for cl
+				}m_data;
+
+			};
+			float m_restituitionCoeff;
+			float m_frictionCoeff;
+			// m_data overlays m_worldNormal: three ints of padding, then m_nPoints,
+			// which with 4-byte ints shares storage with m_worldNormal.w.
+
+			void* m_bodyAPtr;
+			void* m_bodyBPtr;
+			// isInvalid() compares the pointers themselves; casting them to u32 would
+			// truncate (or fail to compile) where pointers are wider than 32 bits.
+
+			float& getNPoints() { return m_data.m_nPoints; }
+			float getNPoints() const { return m_data.m_nPoints; }
+
+			float getPenetration(int idx) const { return m_worldPos[idx].w; }
+
+//			__inline
+//			void load(int idx, const ContactPoint& src);
+//			__inline
+//			void store(int idx, ContactPoint& dst) const;
+
+			bool isInvalid() const { return m_bodyAPtr == 0 && m_bodyBPtr == 0; }	// true only when both body pointers are null
+
+		};
+
+
+#endif //ADL_CONTACT4_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlError.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlError.h
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef CL_ERROR_H
+#define CL_ERROR_H
+
+#ifdef DX11RENDER
+#include <windows.h>
+#endif
+
+#ifdef _DEBUG	// debug builds: a failed assertion triggers __debugbreak()
+	#include <assert.h>
+	#define CLASSERT(x) if(!(x)){__debugbreak(); }
+	#define ADLASSERT(x) if(!(x)){__debugbreak(); }
+#else	// other builds: the condition is still evaluated, but nothing happens
+	#define CLASSERT(x) if(x){}
+	#define ADLASSERT(x) if(x){}
+// NOTE(review): every form expands to a bare if-statement, so an `else` written directly after a use would bind to it.
+#endif
+
+
+
+// COMPILE_TIME_ASSERT(x): debug builds declare a local int array of x elements, so a false (0) or negative constant is meant to fail compilation (NOTE(review): compilers that accept zero-length arrays let 0 through); expands to nothing otherwise.
+#ifdef _DEBUG
+	#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
+#else
+	#define COMPILE_TIME_ASSERT(x)
+#endif
+
+#ifdef _DEBUG	// debug builds: debugPrintf really prints
+	#include <stdarg.h>
+	#include <stdio.h>
+	__inline
+	void debugPrintf(const char *fmt, ...)	// printf-style diagnostic output
+	{
+		va_list arg;
+		va_start(arg, fmt);
+#ifdef DX11RENDER	// DX11 build: format into a buffer and hand it to OutputDebugString
+		char buf[256];	// the formatted message must fit in 256 bytes including the terminator
+		vsprintf_s( buf, 256, fmt, arg );
+#ifdef UNICODE
+		WCHAR wbuf[256];
+		int sizeWide = MultiByteToWideChar(0,0,buf,-1,wbuf,0);	// output size 0: only queries the required length
+		MultiByteToWideChar(0,0,buf,-1,wbuf,sizeWide);	// second call performs the conversion into wbuf
+
+//		swprintf_s( wbuf, 256, L"%s", buf );
+		OutputDebugString( wbuf );
+#else
+		OutputDebugString( buf );
+#endif
+#else	// without DX11RENDER: print to stdout
+		vprintf(fmt, arg);
+#endif
+		va_end(arg);
+	}
+#else	// non-debug builds: same signature, empty body
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+	}
+#endif
+
+// WARN(msg): prints "WARNING: <msg>" and a newline through debugPrintf; the expansion already ends in ';'.
+#define WARN(msg) debugPrintf("WARNING: %s\n", msg);
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMath.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMath.h
@@ -0,0 +1,216 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef CL_MATH_H
+#define CL_MATH_H
+
+#include <stdlib.h>
+#include <math.h>
+#include <float.h>
+#include <xmmintrin.h>
+
+
+#include "AdlError.h"
+#include <algorithm>
+#define pxSort std::sort
+// PI is a float literal. NEXTMULTIPLEOF rounds num up to the nearest multiple of alignment using integer division; both arguments are evaluated more than once.
+#define PI       3.14159265358979323846f
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+// _MEM_CLASSALIGN16 requests 16-byte alignment via __declspec; _MEM_ALIGNED_ALLOCATOR16 gives a class 16-byte aligned operator new/delete (scalar and array forms) plus pass-through placement new/delete.
+#define _MEM_CLASSALIGN16 __declspec(align(16))
+#define _MEM_ALIGNED_ALLOCATOR16 	void* operator new(size_t size) { return _aligned_malloc( size, 16 ); } \
+	void operator delete(void *p) { _aligned_free( p ); } \
+	void* operator new[](size_t size) { return _aligned_malloc( size, 16 ); } \
+	void operator delete[](void *p) { _aligned_free( p ); } \
+	void* operator new(size_t size, void* p) { return p; } \
+	void operator delete(void *p, void* pp) {} 
+
+
+
+template<class T>
+T nextPowerOf2(T n)	// smallest power of two that is >= n, for n >= 1 and while the result fits in T
+{
+	T smeared = n - 1;
+	// OR-ing in every right shift copies the highest set bit into all lower positions.
+	for(int shift=0; shift<sizeof(T)*8; shift++) smeared = smeared | (smeared>>shift);
+	return smeared+1;
+}
+
+
+_MEM_CLASSALIGN16
+struct float4	// four floats, 16-byte aligned, viewable three ways through the union below
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	union
+	{
+		struct
+		{
+			float x,y,z,w;	// named components
+		};
+		struct
+		{
+			float s[4];	// the same four values by index: s[0] is x ... s[3] is w
+		};
+		__m128 m_quad;	// the same storage as one SSE vector value
+	};
+};
+
+__forceinline
+unsigned int isZero(const float4& a)	// 1 when all four components compare equal to 0.f, otherwise 0
+{
+	return (0.f == a.x) & (0.f == a.y) & (0.f == a.z) & (0.f == a.w);
+}
+
+_MEM_CLASSALIGN16
+struct int4	// four ints, 16-byte aligned
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	union
+	{
+		struct
+		{
+			int x,y,z,w;	// named components
+		};
+		struct
+		{
+			int s[4];	// the same four values by index
+		};
+	};
+};
+
+struct int2	// two ints; no alignment request or custom allocator, unlike int4
+{
+	union
+	{
+		struct
+		{
+			int x,y;	// named components
+		};
+		struct
+		{
+			int s[2];	// the same two values by index
+		};
+	};
+};
+
+struct float2	// two floats; no alignment request or custom allocator, unlike float4
+{
+	union
+	{
+		struct
+		{
+			float x,y;	// named components
+		};
+		struct
+		{
+			float s[2];	// the same two values by index
+		};
+	};
+};
+
+
+typedef unsigned int u32;	// named for a 32-bit width; assumes unsigned int is 32 bits on the target
+typedef unsigned short u16;	// named for a 16-bit width; assumes unsigned short is 16 bits
+typedef unsigned char u8;
+
+
+
+#include "Adlfloat4.inl"
+//#include <Common/Math/float4SSE.inl>
+
+
+
+
+template<typename T>
+void swap2(T& a, T& b)	// exchanges a and b through one temporary copy
+{
+	T held = a;
+	a = b;
+	b = held;
+}
+
+
+__inline
+void randSeed(int seed)	// seeds the C library generator that randRange draws from
+{
+	srand( (unsigned int)seed );
+}
+
+template<typename T>
+__inline
+T randRange(const T& minV, const T& maxV)	// minV + r*(maxV-minV), with r drawn from {0, 0.0001, ..., 0.9999}
+{
+	const float r = (rand()%10000)/10000.f;
+	const T span = maxV - minV;
+	return (T)(minV + r*span);
+}
+
+template<>
+__inline
+float4 randRange(const float4& minV, const float4& maxV)	// per-lane minV + r*(maxV-minV), each r drawn from {0, 0.0001, ..., 0.9999}
+{
+	float4 r;	// filled one statement per lane: argument evaluation order is unspecified, so four rand() calls inside a single call would reach the lanes in a compiler-dependent order
+	r.x = (rand()%10000)/10000.f; r.y = (rand()%10000)/10000.f; r.z = (rand()%10000)/10000.f; r.w = (rand()%10000)/10000.f;
+	return minV + r*(maxV - minV);
+}
+
+
+struct SortData	// key/value record ordered by key
+{
+	union
+	{
+		u32 m_key;
+		struct { u16 m_key16[2]; };	// the same key storage viewed as two u16 halves
+	};
+	u32 m_value;
+
+	friend bool operator <(const SortData& a, const SortData& b)	// compares m_key only; m_value is ignored
+	{
+		return a.m_key < b.m_key;
+	}
+};
+
+
+
+template<typename T>
+T* addByteOffset(void* baseAddr, u32 offset)	// returns baseAddr advanced by offset bytes, typed as T*
+{
+	return (T*)(((char*)baseAddr)+offset);	// byte-wise pointer arithmetic; routing the address through u32 truncated pointers wider than 32 bits
+}
+
+
+struct Pair32	// pair of u32 values
+{
+	Pair32(){}	// leaves both members uninitialized
+	Pair32(u32 a, u32 b) : m_a(a), m_b(b){}
+
+	u32 m_a;
+	u32 m_b;
+};
+
+struct PtrPair	// pair of untyped pointers
+{
+	PtrPair(){}	// leaves both members uninitialized
+	PtrPair(void* a, void* b) : m_a(a), m_b(b){}
+	template<typename T>	// convenience overload: two pointers of one type, stored as void* (the C-style cast also drops const)
+	PtrPair(T* a, T* b) : m_a((void*)a), m_b((void*)b){}
+
+	void* m_a;
+	void* m_b;
+};
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMatrix3x3.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlMatrix3x3.h
@@ -0,0 +1,194 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef MATRIX3X3_H
+#define MATRIX3X3_H
+
+#include "AdlMath.h"
+
+///////////////////////////////////////
+//	Matrix3x3
+///////////////////////////////////////
+
+typedef 
+_MEM_CLASSALIGN16 struct
+{
+	_MEM_ALIGNED_ALLOCATOR16;
+	float4 m_row[3];	// three rows; x,y,z of each row are the matrix entries, and the helpers in this header that set w store 0 there
+}Matrix3x3;
+
+__inline
+Matrix3x3 mtZero();	// all-zero matrix
+
+__inline
+Matrix3x3 mtIdentity();	// identity matrix
+
+__inline
+Matrix3x3 mtDiagonal(float a, float b, float c);	// diagonal matrix diag(a,b,c)
+
+__inline
+Matrix3x3 mtTranspose(const Matrix3x3& m);	// transpose of m
+
+__inline
+Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b);	// matrix product a*b
+
+__inline
+float4 mtMul1(const Matrix3x3& a, const float4& b);	// matrix times column vector
+
+__inline
+Matrix3x3 mtMul2(float a, const Matrix3x3& b);	// scalar times matrix
+
+__inline
+float4 mtMul3(const float4& b, const Matrix3x3& a);	// row vector times matrix (parameter names are swapped relative to the definition)
+
+__inline
+Matrix3x3 mtInvert(const Matrix3x3& m);	// inverse of m
+
+__inline
+Matrix3x3 mtZero()
+{
+	// make_float4(float) replicates its argument into all four lanes, so w is zeroed too.
+	Matrix3x3 zero;
+	for(int r=0; r<3; r++)
+		zero.m_row[r] = make_float4(0.f);
+	return zero;
+}
+
+__inline
+Matrix3x3 mtIdentity()
+{
+	// The identity is diag(1,1,1). mtDiagonal (prototyped earlier in this header) builds
+	// its rows with the three-argument make_float4, whose w defaults to 0, so the rows
+	// come out as (1,0,0,0), (0,1,0,0) and (0,0,1,0).
+	const Matrix3x3 identity = mtDiagonal( 1.f, 1.f, 1.f );
+	return identity;
+}
+
+__inline
+Matrix3x3 mtDiagonal(float a, float b, float c)	// rows (a,0,0), (0,b,0), (0,0,c); each row's w is 0
+{
+	Matrix3x3 diag;
+	diag.m_row[0] = make_float4(a, 0.f, 0.f);
+	diag.m_row[1] = make_float4(0.f, b, 0.f);
+	diag.m_row[2] = make_float4(0.f, 0.f, c);
+	return diag;
+}
+
+__inline
+Matrix3x3 mtTranspose(const Matrix3x3& m)
+{
+	// Row `col` of the result collects entry `col` of each input row; every w is set to 0.
+	Matrix3x3 t;
+	for(int col=0; col<3; col++)
+		t.m_row[col] = make_float4(m.m_row[0].s[col], m.m_row[1].s[col], m.m_row[2].s[col], 0.f);
+	return t;
+}
+
+__inline
+Matrix3x3 mtMul(const Matrix3x3& a, const Matrix3x3& b)	// matrix product a*b (dot3F4 is presumably the xyz dot product)
+{
+	Matrix3x3 transB = mtTranspose( b );	// columns of b as rows, so every entry is a row-by-row dot product
+	Matrix3x3 ans;
+	for(int i=0; i<3; i++)
+	{
+		ans.m_row[i].s[0] = dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].s[1] = dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].s[2] = dot3F4(a.m_row[i],transB.m_row[2]);
+		ans.m_row[i].s[3] = 0.f;	// zero the w lane, as mtTranspose and mtInvert do, instead of returning it indeterminate
+	}
+	return ans;
+}
+
+__inline
+float4 mtMul1(const Matrix3x3& a, const float4& b)
+{
+	// Matrix times column vector: lane i is dot3F4(row i, b); w is zeroed explicitly so callers never read an indeterminate lane.
+	const float x = dot3F4( a.m_row[0], b );
+	const float y = dot3F4( a.m_row[1], b );
+	const float z = dot3F4( a.m_row[2], b );
+	return make_float4( x, y, z, 0.f );
+}
+
+__inline
+Matrix3x3 mtMul2(float a, const Matrix3x3& b)
+{
+	// Scalar times matrix: each row of b is multiplied by a through float4's scalar operator*.
+	Matrix3x3 scaled;
+	for(int r=0; r<3; r++)
+		scaled.m_row[r] = a*b.m_row[r];
+	return scaled;
+}
+
+__inline
+float4 mtMul3(const float4& a, const Matrix3x3& b)
+{
+	// Row vector times matrix: lane j = a.x*row0[j] + a.y*row1[j] + a.z*row2[j]; w is zeroed explicitly so callers never read an indeterminate lane.
+	const float x = a.x*b.m_row[0].x + a.y*b.m_row[1].x + a.z*b.m_row[2].x;
+	const float y = a.x*b.m_row[0].y + a.y*b.m_row[1].y + a.z*b.m_row[2].y;
+	const float z = a.x*b.m_row[0].z + a.y*b.m_row[1].z + a.z*b.m_row[2].z;
+	return make_float4( x, y, z, 0.f );
+}
+
+__inline
+Matrix3x3 mtInvert(const Matrix3x3& m)	// inverse of the 3x3 part of m, computed as adjugate / determinant
+{
+	float det = m.m_row[0].s[0]*m.m_row[1].s[1]*m.m_row[2].s[2]+m.m_row[1].s[0]*m.m_row[2].s[1]*m.m_row[0].s[2]+m.m_row[2].s[0]*m.m_row[0].s[1]*m.m_row[1].s[2]
+	-m.m_row[0].s[0]*m.m_row[2].s[1]*m.m_row[1].s[2]-m.m_row[2].s[0]*m.m_row[1].s[1]*m.m_row[0].s[2]-m.m_row[1].s[0]*m.m_row[0].s[1]*m.m_row[2].s[2];
+	// A zero determinant is only caught by the assertion below; the division at the end is otherwise unguarded.
+	CLASSERT( det );
+	// ans first receives the adjugate (transposed cofactor matrix), one row at a time, with each w set to 0.
+	Matrix3x3 ans;
+	ans.m_row[0].s[0] = m.m_row[1].s[1]*m.m_row[2].s[2] - m.m_row[1].s[2]*m.m_row[2].s[1];
+	ans.m_row[0].s[1] = m.m_row[0].s[2]*m.m_row[2].s[1] - m.m_row[0].s[1]*m.m_row[2].s[2];
+	ans.m_row[0].s[2] = m.m_row[0].s[1]*m.m_row[1].s[2] - m.m_row[0].s[2]*m.m_row[1].s[1];
+	ans.m_row[0].w = 0.f;
+
+	ans.m_row[1].s[0] = m.m_row[1].s[2]*m.m_row[2].s[0] - m.m_row[1].s[0]*m.m_row[2].s[2];
+	ans.m_row[1].s[1] = m.m_row[0].s[0]*m.m_row[2].s[2] - m.m_row[0].s[2]*m.m_row[2].s[0];
+	ans.m_row[1].s[2] = m.m_row[0].s[2]*m.m_row[1].s[0] - m.m_row[0].s[0]*m.m_row[1].s[2];
+	ans.m_row[1].w = 0.f;
+
+	ans.m_row[2].s[0] = m.m_row[1].s[0]*m.m_row[2].s[1] - m.m_row[1].s[1]*m.m_row[2].s[0];
+	ans.m_row[2].s[1] = m.m_row[0].s[1]*m.m_row[2].s[0] - m.m_row[0].s[0]*m.m_row[2].s[1];
+	ans.m_row[2].s[2] = m.m_row[0].s[0]*m.m_row[1].s[1] - m.m_row[0].s[1]*m.m_row[1].s[0];
+	ans.m_row[2].w = 0.f;
+
+	ans = mtMul2((1.0f/det), ans);	// scale the adjugate by 1/det
+	return ans;
+}
+
+__inline
+Matrix3x3 mtSet( const float4& a, const float4& b, const float4& c )	// matrix whose rows are a, b, c, each copied as a whole float4
+{
+	Matrix3x3 built;
+	built.m_row[2] = c;
+	built.m_row[1] = b;
+	built.m_row[0] = a;
+	return built;
+}
+
+__inline
+Matrix3x3 operator+(const Matrix3x3& a, const Matrix3x3& b)
+{
+	// Row-wise sum, delegating to float4's operator+ for each of the three rows.
+	Matrix3x3 sum;
+	for(int r=0; r<3; r++)
+		sum.m_row[r] = a.m_row[r] + b.m_row[r];
+	return sum;
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlQuaternion.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlQuaternion.h
@@ -0,0 +1,155 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef QUATERNION_H
+#define QUATERNION_H
+
+#include "AdlMatrix3x3.h"
+
+
+typedef float4 Quaternion;	// stored as (x,y,z,w): vector part in x,y,z and scalar part in w
+
+__inline
+Quaternion qtSet(const float4& axis, float angle);	// quaternion from a rotation axis and an angle
+
+__inline
+Quaternion qtMul(const Quaternion& a, const Quaternion& b);	// quaternion product a*b
+
+__inline
+float4 qtRotate(const Quaternion& q, const float4& vec);	// vec rotated by q
+
+__inline
+float4 qtInvRotate(const Quaternion& q, const float4& vec);	// vec rotated by qtInvert(q)
+
+__inline
+Quaternion qtInvert(const Quaternion& q);	// q with its x,y,z negated
+
+__inline
+Matrix3x3 qtGetRotationMatrix(const Quaternion& quat);	// 3x3 matrix form of quat
+
+__inline
+Quaternion qtNormalize(const Quaternion& q);	// forwards to normalize4
+
+__inline
+Quaternion qtGetIdentity() { return make_float4(0.f, 0.f, 0.f, 1.f); }	// (x,y,z,w) = (0,0,0,1)
+
+__inline
+Quaternion qtSet(const float4& axis, float angle)	// quaternion for a rotation of `angle` radians about `axis` (axis is normalized here)
+{
+	const float4 unitAxis = normalize3( axis );
+	const float halfAngle = angle/2;
+	Quaternion q;
+	q.x = unitAxis.x*sin(halfAngle);
+	q.y = unitAxis.y*sin(halfAngle);
+	q.z = unitAxis.z*sin(halfAngle);
+	q.w = cos(halfAngle);
+	return q;
+}
+
+__inline
+Quaternion qtMul(const Quaternion& a, const Quaternion& b)
+{
+	// Vector part: cross3(a,b) + a.w*b + b.w*a.  Scalar part: a.w*b.w - (a.xyz . b.xyz).
+	Quaternion product = cross3( a, b );
+	product += a.w*b + b.w*a;
+	product.w = a.w*b.w - (a.x*b.x+a.y*b.y+a.z*b.z);	// overwrites whatever the two lines above left in w
+	return product;
+}
+
+__inline
+float4 qtRotate(const Quaternion& q, const float4& vec)	// computes q * (vec with w = 0) * qtInvert(q)
+{
+	Quaternion pure = vec;
+	pure.w = 0.f;	// treat vec as a quaternion with zero scalar part
+	const Quaternion conj = qtInvert( q );
+	const float4 rotated = qtMul( qtMul( q, pure ), conj );
+	return rotated;
+}
+
+__inline
+float4 qtInvRotate(const Quaternion& q, const float4& vec)	// rotates vec by qtInvert(q), i.e. the opposite rotation for a unit-length q
+{
+	return qtRotate( qtInvert( q ), vec );
+}
+
+__inline
+Quaternion qtInvert(const Quaternion& q)
+{
+	// Conjugate: negate x,y,z and keep w. It is the true inverse only for a unit-length q; nothing is normalized here.
+	Quaternion conj;
+	conj.w = q.w;
+	conj.x = -q.x;
+	conj.y = -q.y;
+	conj.z = -q.z;
+	return conj;
+}
+
+__inline
+Matrix3x3 qtGetRotationMatrix(const Quaternion& quat)	// expands quat (x,y,z,w = s[0..3]) into a 3x3 matrix
+{
+	float4 quat2 = make_float4(quat.s[0]*quat.s[0], quat.s[1]*quat.s[1], quat.s[2]*quat.s[2], 0.f);	// squared x, y, z
+	Matrix3x3 out;
+	// quat is used as given (no normalization); the result is a pure rotation only for a unit-length quat.
+	out.m_row[0].s[0]=1-2*quat2.s[1]-2*quat2.s[2];
+	out.m_row[0].s[1]=2*quat.s[0]*quat.s[1]-2*quat.s[3]*quat.s[2];
+	out.m_row[0].s[2]=2*quat.s[0]*quat.s[2]+2*quat.s[3]*quat.s[1];
+	out.m_row[0].s[3] = 0.f;
+
+	out.m_row[1].s[0]=2*quat.s[0]*quat.s[1]+2*quat.s[3]*quat.s[2];
+	out.m_row[1].s[1]=1-2*quat2.s[0]-2*quat2.s[2];
+	out.m_row[1].s[2]=2*quat.s[1]*quat.s[2]-2*quat.s[3]*quat.s[0];
+	out.m_row[1].s[3] = 0.f;
+
+	out.m_row[2].s[0]=2*quat.s[0]*quat.s[2]-2*quat.s[3]*quat.s[1];
+	out.m_row[2].s[1]=2*quat.s[1]*quat.s[2]+2*quat.s[3]*quat.s[0];
+	out.m_row[2].s[2]=1-2*quat2.s[0]-2*quat2.s[1];
+	out.m_row[2].s[3] = 0.f;
+
+	return out;
+}
+
+__inline
+Quaternion qtGetQuaternion(const Matrix3x3* m)	// converts the single matrix *m (accessed as m[0]) back to a quaternion
+{
+	Quaternion q;
+	q.w = sqrtf( m[0].m_row[0].x + m[0].m_row[1].y + m[0].m_row[2].z + 1 ) * 0.5f;	// w from the trace: trace + 1 = 4*w*w for a unit quaternion
+	float inv4w = 1.f/(4.f*q.w);
+	q.x = (m[0].m_row[2].y-m[0].m_row[1].z)*inv4w;
+	q.y = (m[0].m_row[0].z-m[0].m_row[2].x)*inv4w;
+	q.z = (m[0].m_row[1].x-m[0].m_row[0].y)*inv4w;
+	// NOTE(review): a trace of -1 (180 degree rotation) makes q.w zero and inv4w infinite; a trace below -1 from rounding makes sqrtf return NaN. Neither case is handled.
+	return q;
+}
+
+__inline
+Quaternion qtNormalize(const Quaternion& q)	// forwards to normalize4, presumably a four-component normalization
+{
+	return normalize4(q);
+}
+
+__inline
+float4 transform(const float4& p, const float4& translation, const Quaternion& orientation)	// rotates p by orientation, then adds translation
+{
+	return qtRotate( orientation, p ) + translation;
+}
+
+__inline
+float4 invTransform(const float4& p, const float4& translation, const Quaternion& orientation)	// subtracts translation, then rotates by qtInvert(orientation)
+{
+	return qtInvRotate( orientation, p-translation );
+}
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlRigidBody.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlRigidBody.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_RIGID_BODY_H
+#define ADL_RIGID_BODY_H
+
+#include "AdlQuaternion.h"
+
+class RigidBodyBase	// holds only the two nested record types below; no members of its own
+{
+	public:
+
+		_MEM_CLASSALIGN16
+		struct Body	// per-body state
+		{
+			_MEM_ALIGNED_ALLOCATOR16;
+
+			float4 m_pos;	// position
+			Quaternion m_quat;	// orientation
+			float4 m_linVel;	// linear velocity, going by the name
+			float4 m_angVel;	// angular velocity, going by the name
+
+			u32 m_shapeIdx;
+			u32 m_shapeType;
+
+			float m_invMass;	// reciprocal of the mass, going by the name
+			float m_restituitionCoeff;
+			float m_frictionCoeff;
+			
+		};
+
+		struct Inertia	// per-body inertia data, kept separate from Body
+		{
+/*			u16 m_shapeType;
+			u16 m_shapeIdx;
+			float m_restituitionCoeff;
+			float m_frictionCoeff;
+			int m_padding;
+*/
+			Matrix3x3 m_invInertia;	// NOTE(review): presumably the current inverse inertia tensor — confirm with the code that updates it
+			Matrix3x3 m_initInvInertia;	// NOTE(review): presumably the initial inverse inertia tensor — confirm
+		};
+};
+
+#endif// ADL_RIGID_BODY_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlTransform.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/AdlTransform.h
@@ -0,0 +1,61 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef _ADL_TRANSFORM_H
+#define _ADL_TRANSFORM_H
+
+#include "AdlMath.h"
+#include "AdlQuaternion.h"
+#include "AdlMatrix3x3.h"
+
+struct Transform	// a rotation matrix plus a translation; trMul1 applies it as rotation * p + translation
+{
+	float4 m_translation;
+	Matrix3x3 m_rotation;
+};
+
+__inline Transform trSetTransform(const float4& translation, const Quaternion& quat)	// __inline: this definition lives in a header and must not produce one external symbol per including source file
+{
+	Transform tr;
+	tr.m_translation = translation;
+	tr.m_rotation = qtGetRotationMatrix( quat );	// quaternion converted to its 3x3 matrix form
+	return tr;
+}
+
+__inline Transform trInvert( const Transform& tr )	// __inline: header-defined, see the other functions in these headers; inverse assumes m_rotation is orthonormal
+{
+	Transform ans;
+	ans.m_rotation = mtTranspose( tr.m_rotation );	// the transpose stands in for the inverse rotation
+	ans.m_translation = mtMul1( ans.m_rotation, -tr.m_translation );	// inverse translation = R^T * (-t)
+	return ans;
+}
+
+__inline Transform trMul(const Transform& trA, const Transform& trB)	// __inline: header-defined; composes the transforms so that trB is applied first, then trA
+{
+	Transform ans; 
+	ans.m_rotation = mtMul( trA.m_rotation, trB.m_rotation );
+	ans.m_translation = mtMul1( trA.m_rotation, trB.m_translation ) + trA.m_translation;	// trB's translation carried through trA
+	return ans;
+}
+
+__inline float4 trMul1(const Transform& tr, const float4& p)	// __inline: header-defined; applies tr to the point p
+{
+	return mtMul1( tr.m_rotation, p ) + tr.m_translation;	// rotate first, then translate
+}
+
+
+#endif //_ADL_TRANSFORM_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4.inl
@@ -0,0 +1,373 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
+#define CHECK_ALIGNMENT(a) a;
+
+
+// Packs four scalars into a float4; w is 0 when omitted.
+__inline
+float4 make_float4(float x, float y, float z, float w = 0.f)
+{
+	float4 result;
+	result.x = x;
+	result.y = y;
+	result.z = z;
+	result.w = w;
+	return result;
+}
+
+// Broadcasts one scalar to all four components.
+__inline
+float4 make_float4(float x)
+{
+	const float splat = x;
+	return make_float4(splat, splat, splat, splat);
+}
+
+// Converts each integer component of an int4 to float.
+__inline
+float4 make_float4(const int4& x)
+{
+	const float c0 = (float)x.s[0];
+	const float c1 = (float)x.s[1];
+	const float c2 = (float)x.s[2];
+	const float c3 = (float)x.s[3];
+	return make_float4(c0, c1, c2, c3);
+}
+
+// Packs two scalars into a float2.
+__inline
+float2 make_float2(float x, float y)
+{
+	float2 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	return result;
+}
+
+// Broadcasts one scalar to both components.
+__inline
+float2 make_float2(float x)
+{
+	const float splat = x;
+	return make_float2(splat, splat);
+}
+
+// Converts each integer component of an int2 to float.
+__inline
+float2 make_float2(const int2& x)
+{
+	const float c0 = (float)x.s[0];
+	const float c1 = (float)x.s[1];
+	return make_float2(c0, c1);
+}
+
+// Packs four integers into an int4; w is 0 when omitted.
+__inline
+int4 make_int4(int x, int y, int z, int w = 0)
+{
+	const int comps[4] = { x, y, z, w };
+	int4 result;
+	for(int i=0; i<4; i++)
+	{
+		result.s[i] = comps[i];
+	}
+	return result;
+}
+
+// Broadcasts one integer to all four components.
+__inline
+int4 make_int4(int x)
+{
+	const int splat = x;
+	return make_int4(splat, splat, splat, splat);
+}
+
+// Converts each component of a float4 to int with a C-style cast.
+__inline
+int4 make_int4(const float4& x)
+{
+	const int ix = (int)x.x;
+	const int iy = (int)x.y;
+	const int iz = (int)x.z;
+	const int iw = (int)x.w;
+	return make_int4(ix, iy, iz, iw);
+}
+
+// Packs two integers into an int2.
+__inline
+int2 make_int2(int a, int b)
+{
+	int2 result;
+	result.x = a;
+	result.y = b;
+	return result;
+}
+
+// Unary minus: negates all four components.
+__inline
+float4 operator-(const float4& a)
+{
+	const float nx = -a.x;
+	const float ny = -a.y;
+	const float nz = -a.z;
+	const float nw = -a.w;
+	return make_float4(nx, ny, nz, nw);
+}
+
+// Component-wise product of two float4 values (all four lanes).
+// The alignment hook is CHECK_ALIGNMENT, as in the other operators of this file,
+// rather than a hand-written CLASSERT on u32(&a): casting an address to u32
+// truncates it on targets whose pointers are wider than 32 bits.
+__inline
+float4 operator*(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 out;
+	out.s[0] = a.s[0]*b.s[0];
+	out.s[1] = a.s[1]*b.s[1];
+	out.s[2] = a.s[2]*b.s[2];
+	out.s[3] = a.s[3]*b.s[3];
+	return out;
+}
+
+// Scales every component of b by the scalar a.
+__inline
+float4 operator*(float a, const float4& b)
+{
+	const float c0 = a*b.s[0];
+	const float c1 = a*b.s[1];
+	const float c2 = a*b.s[2];
+	const float c3 = a*b.s[3];
+	return make_float4(c0, c1, c2, c3);
+}
+
+// Scales every component of b by the scalar a (vector on the left-hand side).
+__inline
+float4 operator*(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	const float c0 = a*b.s[0];
+	const float c1 = a*b.s[1];
+	const float c2 = a*b.s[2];
+	const float c3 = a*b.s[3];
+	return make_float4(c0, c1, c2, c3);
+}
+
+// In-place component-wise multiply: a[i] *= b[i] for all four lanes.
+__inline
+void operator*=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] *= b.s[i];
+	}
+}
+
+// In-place scale: multiplies all four lanes of a by the scalar b.
+__inline
+void operator*=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] *= b;
+	}
+}
+
+//
+// Component-wise quotient a[i] / b[i] for all four lanes.
+__inline
+float4 operator/(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 quotient;
+	for(int i=0; i<4; i++)
+	{
+		quotient.s[i] = a.s[i]/b.s[i];
+	}
+	return quotient;
+}
+
+// Divides every component of b by the scalar a.
+__inline
+float4 operator/(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	const float c0 = b.s[0]/a;
+	const float c1 = b.s[1]/a;
+	const float c2 = b.s[2]/a;
+	const float c3 = b.s[3]/a;
+	return make_float4(c0, c1, c2, c3);
+}
+
+// In-place component-wise divide: a[i] /= b[i] for all four lanes.
+__inline
+void operator/=(float4& a, const float4& b)
+{
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] /= b.s[i];
+	}
+}
+
+// In-place divide of all four lanes of a by the scalar b.
+// The alignment hook is CHECK_ALIGNMENT, as in the other operators of this file,
+// rather than a hand-written CLASSERT on u32(&a): casting an address to u32
+// truncates it on targets whose pointers are wider than 32 bits.
+__inline
+void operator/=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	a.s[0]/=b;
+	a.s[1]/=b;
+	a.s[2]/=b;
+	a.s[3]/=b;
+}
+//
+
+// Component-wise sum of two float4 values.
+__inline
+float4 operator+(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	for(int i=0; i<4; i++)
+	{
+		sum.s[i] = a.s[i]+b.s[i];
+	}
+	return sum;
+}
+
+// Adds the scalar b to every component of a.
+__inline
+float4 operator+(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	for(int i=0; i<4; i++)
+	{
+		sum.s[i] = a.s[i]+b;
+	}
+	return sum;
+}
+
+// Component-wise difference a[i] - b[i].
+__inline
+float4 operator-(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	for(int i=0; i<4; i++)
+	{
+		diff.s[i] = a.s[i]-b.s[i];
+	}
+	return diff;
+}
+
+// Subtracts the scalar b from every component of a.
+__inline
+float4 operator-(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	for(int i=0; i<4; i++)
+	{
+		diff.s[i] = a.s[i]-b;
+	}
+	return diff;
+}
+
+// In-place component-wise add: a[i] += b[i].
+__inline
+void operator+=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] += b.s[i];
+	}
+}
+
+// In-place add of the scalar b to all four lanes of a.
+__inline
+void operator+=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] += b;
+	}
+}
+
+// In-place component-wise subtract: a[i] -= b[i].
+__inline
+void operator-=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] -= b.s[i];
+	}
+}
+
+// In-place subtract of the scalar b from all four lanes of a.
+__inline
+void operator-=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	for(int i=0; i<4; i++)
+	{
+		a.s[i] -= b;
+	}
+}
+
+
+
+
+
+// 3D cross product of the first three components of a and b; the fourth
+// component of the result is 0.
+__inline
+float4 cross3(const float4& a, const float4& b)
+{
+	const float cx = a.s[1]*b.s[2]-a.s[2]*b.s[1];
+	const float cy = a.s[2]*b.s[0]-a.s[0]*b.s[2];
+	const float cz = a.s[0]*b.s[1]-a.s[1]*b.s[0];
+	return make_float4(cx, cy, cz, 0);
+}
+
+// Dot product over x, y and z only; w is ignored.
+__inline
+float dot3F4(const float4& a, const float4& b)
+{
+	float sum = a.x*b.x;
+	sum = sum + a.y*b.y;
+	sum = sum + a.z*b.z;
+	return sum;
+}
+
+// Euclidean length of the xyz part of a.
+__inline
+float length3(const float4& a)
+{
+	const float lengthSquared = dot3F4(a,a);
+	return sqrtf(lengthSquared);
+}
+
+// Dot product over all four components.
+__inline
+float dot4(const float4& a, const float4& b)
+{
+	float sum = a.x*b.x;
+	sum = sum + a.y*b.y;
+	sum = sum + a.z*b.z;
+	sum = sum + a.w*b.w;
+	return sum;
+}
+
+//	for height
+// Evaluates eqn at point treating point.w as 1:
+// point.xyz . eqn.xyz + eqn.w (point.w itself is not read).
+__inline
+float dot3w1(const float4& point, const float4& eqn)
+{
+	float sum = point.x*eqn.x;
+	sum = sum + point.y*eqn.y;
+	sum = sum + point.z*eqn.z;
+	sum = sum + eqn.w;
+	return sum;
+}
+
+// Scales all four components of a by 1 / |a.xyz|. No guard for zero length.
+__inline
+float4 normalize3(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot3F4(a, a));
+	return invLength * a;
+}
+
+// Scales a by 1 / |a| using the four-component length. No guard for zero length.
+__inline
+float4 normalize4(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot4(a, a));
+	return invLength * a;
+}
+
+// Plane through a, b and c: xyz is the unit normal cross3(b-a, c-a) normalized,
+// and w = -dot(normal, a), so dot3w1(a, result) evaluates to (about) zero.
+__inline
+float4 createEquation(const float4& a, const float4& b, const float4& c)
+{
+	const float4 edgeAB = b-a;
+	const float4 edgeAC = c-a;
+	float4 plane = normalize3( cross3(edgeAB, edgeAC) );
+	plane.w = -dot3F4(plane,a);
+	return plane;
+}
+
+
+// Returns a when a > b, otherwise b (so b is returned when they compare equal).
+template<typename T>
+__inline
+T max2(const T& a, const T& b)
+{
+	if( a>b )
+	{
+		return a;
+	}
+	return b;
+}
+
+// Returns a when a < b, otherwise b (so b is returned when they compare equal).
+template<typename T>
+__inline
+T min2(const T& a, const T& b)
+{
+	if( a<b )
+	{
+		return a;
+	}
+	return b;
+}
+
+// float4 specialization: per-component maximum over x, y, z and w.
+template<>
+__inline
+float4 max2(const float4& a, const float4& b)
+{
+	const float mx = max2(a.x,b.x);
+	const float my = max2(a.y,b.y);
+	const float mz = max2(a.z,b.z);
+	const float mw = max2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
+// float4 specialization: per-component minimum over x, y, z and w.
+template<>
+__inline
+float4 min2(const float4& a, const float4& b)
+{
+	const float mx = min2(a.x,b.x);
+	const float my = min2(a.y,b.y);
+	const float mz = min2(a.z,b.z);
+	const float mw = min2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Adlfloat4SSE.inl
@@ -0,0 +1,381 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//#define CHECK_ALIGNMENT(a) CLASSERT((u32(&(a)) & 0xf) == 0);
+#define CHECK_ALIGNMENT(a) a;
+
+
+// Packs four scalars into the SSE register of a float4; w is 0 when omitted.
+__inline
+float4 make_float4(float x, float y, float z, float w = 0.f)
+{
+	float4 result;
+	// _mm_set_ps takes its arguments highest lane first, hence w,z,y,x.
+	result.m_quad = _mm_set_ps(w,z,y,x);
+	return result;
+}
+
+// Broadcasts one scalar to all four components.
+__inline
+float4 make_float4(float x)
+{
+	const float splat = x;
+	return make_float4(splat, splat, splat, splat);
+}
+
+// Converts each integer component of an int4 to float.
+__inline
+float4 make_float4(const int4& x)
+{
+	const float c0 = (float)x.s[0];
+	const float c1 = (float)x.s[1];
+	const float c2 = (float)x.s[2];
+	const float c3 = (float)x.s[3];
+	return make_float4(c0, c1, c2, c3);
+}
+
+// Packs two scalars into a float2.
+__inline
+float2 make_float2(float x, float y)
+{
+	float2 result;
+	result.s[0] = x;
+	result.s[1] = y;
+	return result;
+}
+
+// Broadcasts one scalar to both components.
+__inline
+float2 make_float2(float x)
+{
+	const float splat = x;
+	return make_float2(splat, splat);
+}
+
+// Converts each integer component of an int2 to float.
+__inline
+float2 make_float2(const int2& x)
+{
+	const float c0 = (float)x.s[0];
+	const float c1 = (float)x.s[1];
+	return make_float2(c0, c1);
+}
+
+// Packs four integers into an int4; w is 0 when omitted.
+__inline
+int4 make_int4(int x, int y, int z, int w = 0)
+{
+	const int comps[4] = { x, y, z, w };
+	int4 result;
+	for(int i=0; i<4; i++)
+	{
+		result.s[i] = comps[i];
+	}
+	return result;
+}
+
+// Broadcasts one integer to all four components.
+__inline
+int4 make_int4(int x)
+{
+	const int splat = x;
+	return make_int4(splat, splat, splat, splat);
+}
+
+// Converts each component of a float4 to int with a C-style cast.
+__inline
+int4 make_int4(const float4& x)
+{
+	const int ix = (int)x.x;
+	const int iy = (int)x.y;
+	const int iz = (int)x.z;
+	const int iw = (int)x.w;
+	return make_int4(ix, iy, iz, iw);
+}
+
+// Packs two integers into an int2.
+__inline
+int2 make_int2(int a, int b)
+{
+	int2 result;
+	result.x = a;
+	result.y = b;
+	return result;
+}
+
+// Unary minus, computed as (0 - a) on all four lanes.
+__inline
+float4 operator-(const float4& a)
+{
+	float4 negated;
+	negated.m_quad = _mm_sub_ps( _mm_setzero_ps(), a.m_quad );
+	return negated;
+}
+
+// Component-wise product of two float4 values using one packed multiply.
+__inline
+float4 operator*(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 product;
+	product.m_quad = _mm_mul_ps( a.m_quad, b.m_quad );
+	return product;
+}
+
+// Scales b by the scalar a: a is broadcast to all lanes, then multiplied.
+__inline
+float4 operator*(float a, const float4& b)
+{
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( a );
+	return scale*b;
+}
+
+// Scales b by the scalar a (vector on the left-hand side).
+__inline
+float4 operator*(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( a );
+	return scale*b;
+}
+
+// In-place component-wise multiply, implemented via operator*.
+__inline
+void operator*=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 product = a*b;
+	a = product;
+}
+
+// In-place scale of a by the scalar b (b is broadcast to all lanes).
+__inline
+void operator*=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 scale;
+	scale.m_quad = _mm_set1_ps( b );
+	const float4 product = a*scale;
+	a = product;
+}
+
+//
+// Component-wise quotient a / b using one packed divide.
+__inline
+float4 operator/(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 quotient;
+	quotient.m_quad = _mm_div_ps( a.m_quad, b.m_quad );
+	return quotient;
+}
+
+// Divides every lane of b by the scalar a (a is broadcast first).
+__inline
+float4 operator/(const float4& b, float a)
+{
+	CHECK_ALIGNMENT(b);
+
+	float4 divisor;
+	divisor.m_quad = _mm_set1_ps( a );
+	return b/divisor;
+}
+
+// In-place component-wise divide, implemented via operator/.
+__inline
+void operator/=(float4& a, const float4& b)
+{
+	const float4 quotient = a/b;
+	a = quotient;
+}
+
+// In-place divide of all four lanes of a by the scalar b (b is broadcast first).
+// The alignment hook is CHECK_ALIGNMENT, as in the other operators of this file,
+// rather than a hand-written CLASSERT on u32(&a): casting an address to u32
+// truncates it on targets whose pointers are wider than 32 bits.
+__inline
+void operator/=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 bv; bv.m_quad = _mm_set1_ps( b );
+	a = a/bv;
+}
+//
+
+// Component-wise sum using one packed add.
+__inline
+float4 operator+(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 sum;
+	sum.m_quad = _mm_add_ps( a.m_quad, b.m_quad );
+	return sum;
+}
+
+// Adds the scalar b to every lane of a (b is broadcast first).
+__inline
+float4 operator+(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 addend;
+	addend.m_quad = _mm_set1_ps( b );
+	return a+addend;
+}
+
+// Component-wise difference a - b using one packed subtract.
+__inline
+float4 operator-(const float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 diff;
+	diff.m_quad = _mm_sub_ps( a.m_quad, b.m_quad );
+	return diff;
+}
+
+// Subtracts the scalar b from every lane of a (b is broadcast first).
+__inline
+float4 operator-(const float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 subtrahend;
+	subtrahend.m_quad = _mm_set1_ps( b );
+	return a-subtrahend;
+}
+
+// In-place component-wise add, implemented via operator+.
+__inline
+void operator+=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 sum = a + b;
+	a = sum;
+}
+
+// In-place add of the scalar b to all lanes of a.
+__inline
+void operator+=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 addend;
+	addend.m_quad = _mm_set1_ps( b );
+	const float4 sum = a + addend;
+	a = sum;
+}
+
+// In-place component-wise subtract, implemented via operator-.
+__inline
+void operator-=(float4& a, const float4& b)
+{
+	CHECK_ALIGNMENT(a);
+
+	const float4 diff = a - b;
+	a = diff;
+}
+
+// In-place subtract of the scalar b from all lanes of a.
+__inline
+void operator-=(float4& a, float b)
+{
+	CHECK_ALIGNMENT(a);
+
+	float4 subtrahend;
+	subtrahend.m_quad = _mm_set1_ps( b );
+	const float4 diff = a - subtrahend;
+	a = diff;
+}
+
+
+
+
+
+// 3D cross product of a.xyz and b.xyz; the w lane of the result is forced to 0.
+// Computed as (y1,z1,x1)*(z2,x2,y2) - (z1,x1,y1)*(y2,z2,x2) with two rounds of
+// lane shuffles. The comment before each shuffle lists the lanes it produces.
+__inline
+float4 cross3(const float4& a, const float4& b)
+{	//	xnamathvector.inl (presumably the XNA Math source this was adapted from)
+	// Lets the bit mask below be written as integers and read back as a __m128.
+	union IntVec
+	{
+		unsigned int m_i[4];
+		__m128 m_v;
+	};
+
+	// All bits set for x, y, z; zero for w. ANDing with it clears the w lane.
+	IntVec mask3 = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000};
+	__m128 V1 = a.m_quad;
+	__m128 V2 = b.m_quad;
+
+    // y1,z1,x1,w1
+    __m128 vTemp1 = _mm_shuffle_ps(V1,V1,_MM_SHUFFLE(3,0,2,1));
+    // z2,x2,y2,w2
+    __m128 vTemp2 = _mm_shuffle_ps(V2,V2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the left operation
+    __m128 vResult = _mm_mul_ps(vTemp1,vTemp2);
+    // z1,x1,y1,w1
+    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp1,_MM_SHUFFLE(3,0,2,1));
+    // y2,z2,x2,w2
+    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(3,1,0,2));
+    // Perform the right operation
+    vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
+    // Subtract the right from left, and return answer
+    vResult = _mm_sub_ps(vResult,vTemp1);
+    // Set w to zero
+	float4 ans; ans.m_quad = _mm_and_ps(vResult,mask3.m_v);
+	return ans;
+}
+
+// Dot product over x, y and z only (w is ignored), returned as a scalar.
+// The products are formed with one packed multiply and then summed in lane 0.
+__inline
+float dot3F4(const float4& a, const float4& b)
+{
+//	Scalar equivalent:
+//	return a.x*b.x+a.y*b.y+a.z*b.z;
+    // Perform the dot product
+	__m128 V1 = a.m_quad;
+	__m128 V2 = b.m_quad;
+
+	// vDot = (x1*x2, y1*y2, z1*z2, w1*w2)
+	__m128 vDot = _mm_mul_ps(V1,V2);
+    // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2]
+    __m128 vTemp = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(2,1,2,1));
+    // Result.vector4_f32[0] = x+y
+    vDot = _mm_add_ss(vDot,vTemp);
+    // x=Dot.vector4_f32[2]
+    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,1,1,1));
+    // Result.vector4_f32[0] = (x+y)+z
+    vDot = _mm_add_ss(vDot,vTemp);
+    // Splat x
+	float4 ans; ans.m_quad = _mm_shuffle_ps(vDot,vDot,_MM_SHUFFLE(0,0,0,0));
+	// Every lane now holds the sum; lane 0 is returned.
+	return ans.x;
+}
+
+// Euclidean length of the xyz part of a.
+__inline
+float length3(const float4& a)
+{
+	const float lengthSquared = dot3F4(a,a);
+	return sqrtf(lengthSquared);
+}
+
+// Dot product over all four components (scalar code, not SSE).
+__inline
+float dot4(const float4& a, const float4& b)
+{
+	float sum = a.x*b.x;
+	sum = sum + a.y*b.y;
+	sum = sum + a.z*b.z;
+	sum = sum + a.w*b.w;
+	return sum;
+}
+
+//	for height
+// Evaluates eqn at point treating point.w as 1:
+// point.xyz . eqn.xyz + eqn.w (point.w itself is not read).
+__inline
+float dot3w1(const float4& point, const float4& eqn)
+{
+	float sum = point.x*eqn.x;
+	sum = sum + point.y*eqn.y;
+	sum = sum + point.z*eqn.z;
+	sum = sum + eqn.w;
+	return sum;
+}
+
+// Scales all four components of a by 1 / |a.xyz|. No guard for zero length.
+__inline
+float4 normalize3(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot3F4(a, a));
+	return invLength * a;
+}
+
+// Scales a by 1 / |a| using the four-component length. No guard for zero length.
+__inline
+float4 normalize4(const float4& a)
+{
+	const float invLength = 1.f/sqrtf(dot4(a, a));
+	return invLength * a;
+}
+
+// Plane through a, b and c: xyz is the unit normal cross3(b-a, c-a) normalized,
+// and w = -dot(normal, a), so dot3w1(a, result) evaluates to (about) zero.
+__inline
+float4 createEquation(const float4& a, const float4& b, const float4& c)
+{
+	const float4 edgeAB = b-a;
+	const float4 edgeAC = c-a;
+	float4 plane = normalize3( cross3(edgeAB, edgeAC) );
+	plane.w = -dot3F4(plane,a);
+	return plane;
+}
+
+
+// Returns a when a > b, otherwise b (so b is returned when they compare equal).
+template<typename T>
+__inline
+T max2(const T& a, const T& b)
+{
+	if( a>b )
+	{
+		return a;
+	}
+	return b;
+}
+
+// Returns a when a < b, otherwise b (so b is returned when they compare equal).
+template<typename T>
+__inline
+T min2(const T& a, const T& b)
+{
+	if( a<b )
+	{
+		return a;
+	}
+	return b;
+}
+
+// float4 specialization: per-component maximum over x, y, z and w.
+template<>
+__inline
+float4 max2(const float4& a, const float4& b)
+{
+	const float mx = max2(a.x,b.x);
+	const float my = max2(a.y,b.y);
+	const float mz = max2(a.z,b.z);
+	const float mw = max2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
+// float4 specialization: per-component minimum over x, y, z and w.
+template<>
+__inline
+float4 min2(const float4& a, const float4& b)
+{
+	const float mx = min2(a.x,b.x);
+	const float my = min2(a.y,b.y);
+	const float mz = min2(a.z,b.z);
+	const float mw = min2(a.w,b.w);
+	return make_float4( mx, my, mz, mw );
+}
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowPhase.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowPhase.h
@@ -0,0 +1,154 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+#include <Adl/Adl.h>
+//#include <Common/Base/SyncObjects.h>
+
+#include "AdlMath.h"
+#include "AdlContact4.h"
+#include "AdlRigidBody.h"
+
+#include "../ConvexHeightFieldShape.h"
+
+//#include "TypeDefinition.h"
+//#include "RigidBody.h"
+//#include "ConvexHeightFieldShape.h"
+
+namespace adl
+{
+class ShapeBase;
+
+// Device-independent base of ChNarrowphase<TYPE>; it only declares the Config
+// structure that callers pass to execute() and culling().
+class ChNarrowphaseBase
+{
+	public:
+		// Run-time settings for the narrowphase.
+		struct Config
+		{
+			float m_collisionMargin;	// copied into ConstData::m_collisionMargin for the kernels
+		};
+// Disabled older ShapeData layout (no m_supportHeight4 array), kept for reference.
+// The live definition is ChNarrowphase<TYPE>::ShapeData.
+/*
+		typedef struct
+		{
+			//	m_normal.w == height in u8
+			float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
+
+			float m_scale;
+			float m_padding0;
+			float m_padding1;
+			float m_padding2;
+		} ShapeData;
+*/
+};
+
+// Narrowphase collision detection on convex height fields for one device type.
+// All members are static; per-instance state lives in the Data object returned by
+// allocate() and is passed explicitly to execute() / culling().
+template<DeviceType TYPE>
+class ChNarrowphase : public ChNarrowphaseBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// State created by allocate() and released by deallocate().
+		struct Data
+		{
+			const Device* m_device;					// device the kernels were built for
+			Kernel* m_supportCullingKernel;			// used by culling()
+			Kernel* m_narrowphaseKernel;			// used by execute() without vtxBuf/idxBuf
+			Kernel* m_narrowphaseWithPlaneKernel;	// used by execute() with vtxBuf/idxBuf
+
+			Buffer<u32>* m_counterBuffer;			// single u32 output counter shared by all kernels
+		};
+
+		enum
+		{
+			N_TASKS = 4,
+			HEIGHT_RES = ConvexHeightField::HEIGHT_RES,	// samples per edge of one height-field face
+		};
+
+		// One shape as uploaded to the device by setShape().
+		// NOTE(review): the layout presumably has to match the struct used by the
+		// kernels in ChNarrowphaseKernels -- confirm before reordering fields.
+		struct ShapeData
+		{
+			float4 m_normal[HEIGHT_RES*HEIGHT_RES*6];
+			// setShape() packs four u8 samples into each u32, so it fills only the
+			// first HEIGHT_RES*HEIGHT_RES*6/4 entries of the two arrays below.
+			u32 m_height4[HEIGHT_RES*HEIGHT_RES*6];
+			u32 m_supportHeight4[HEIGHT_RES*HEIGHT_RES*6];
+
+			float m_scale;
+			float m_padding0;
+			float m_padding1;
+			float m_padding2;
+		};
+
+		// Constants handed to each kernel launch through a constant buffer.
+		struct ConstData
+		{
+			int m_nPairs;				// number of input pairs
+			float m_collisionMargin;	// from Config::m_collisionMargin
+			int m_capacity;				// room left in the output buffer
+			int m_paddings[1];
+		};
+		
+		// Creates the kernels and the counter buffer for the given device.
+		static
+		Data* allocate( const Device* device );
+
+		// Frees the counter buffer and the Data object itself.
+		static
+		void deallocate( Data* data );
+// Disabled earlier signatures that exposed Buffer<ShapeData>* directly; the live
+// API below uses the opaque ShapeDataType handle instead.
+/*
+		static
+		Buffer<ShapeData>* allocateShapeBuffer( const Device* device, int capacity );
+
+		static
+		void deallocateShapeBuffer( Buffer<ShapeData>* shapeBuf );
+
+		static
+		void setShape( Buffer<ShapeData>* shapeBuf, ShapeBase* shape, int idx, float collisionMargin );
+*/
+		// Allocates a device buffer with room for `capacity` ShapeData entries.
+		static
+		ShapeDataType allocateShapeBuffer( const Device* device, int capacity );
+
+		// Deletes a buffer obtained from allocateShapeBuffer().
+		static
+		void deallocateShapeBuffer( ShapeDataType shapeBuf );
+
+		// Converts `shape` to a temporary ConvexHeightField and uploads it to slot idx.
+		static
+		void setShape( ShapeDataType shapeBuf, ShapeBase* shape, int idx, float collisionMargin = 0.f );
+		
+		// Uploads an existing ConvexHeightField to slot idx. Note that it expands
+		// cvxShape->m_aabb by collisionMargin as a side effect.
+		static
+		void setShape( ShapeDataType shapeBuf, ConvexHeightField* cvxShape, int idx, float collisionMargin = 0.f );
+
+		// Run NarrowphaseKernel
+		// nContacts is in/out: contacts already in contactOut on entry, total on return.
+		//template<bool USE_OMP>
+		static
+		void execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg );
+
+		// Run NarrowphaseWithPlaneKernel
+		// Same contract as the overload above, with additional vertex/index buffers.
+		//template<bool USE_OMP>
+		static
+		void execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			const Buffer<float4>* vtxBuf, const Buffer<int4>* idxBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg );
+
+		// Run SupportCullingKernel
+		// Returns the number of pairs written to pairsOut (clamped to its size).
+		//template<bool USE_OMP>
+		static
+		int culling( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf, const Buffer<int2>* pairsOut, const Config& cfg );
+};
+
+//#include <AdlPhysics/Narrowphase/ChNarrowphase.inl>
+//#include <AdlPhysics/Narrowphase/ChNarrowphaseHost.inl>
+
+#include "ChNarrowphase.inl"
+
+};
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphase.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphase.inl
@@ -0,0 +1,303 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+//#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\ChNarrowphaseKernels"
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\ChNarrowphaseKernels"
+#define KERNEL0 "SupportCullingKernel"
+#define KERNEL1 "NarrowphaseKernel"
+
+#include "ChNarrowphaseKernels.h"
+
+// Helper routines private to the ChNarrowphase implementation.
+class ChNarrowphaseImp
+{
+public:
+	// Packs four bytes into one u32: x in bits 0-7, y in 8-15, z in 16-23 and
+	// w in 24-31.
+	// Each byte is widened to u32 before shifting. Shifting the u8 directly
+	// promotes it to signed int, so w >= 128 would be shifted into the sign bit.
+	static
+	__inline
+	u32 u32Pack(u8 x, u8 y, u8 z, u8 w)
+	{
+		return u32(x) | (u32(y)<<8) | (u32(z)<<16) | (u32(w)<<24);
+	}
+
+};
+
+// Creates the per-device narrowphase state: looks up the three kernels through
+// device->getKernel() and allocates the one-element counter buffer.
+// The returned Data must be released with deallocate().
+template<DeviceType TYPE>
+typename ChNarrowphase<TYPE>::Data* ChNarrowphase<TYPE>::allocate( const Device* device )
+{
+	char options[100];
+	
+	// Optional embedded kernel source, indexed by TYPE below. Only entry 0 is
+	// populated, and only when ADL_LOAD_KERNEL_FROM_STRING is defined.
+	// NOTE(review): src has two entries; confirm TYPE can never be >= 2 here.
+	const char* src[] = 
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+		{narrowphaseKernelsCL, 0};
+#else
+		{0,0};
+#endif
+
+	
+
+
+	// Build options passed to getKernel() (an include path for the kernel sources).
+	//sprintf(options, "-I ..\\..\\ -Wf,--c++");
+	sprintf(options, "-I .\\NarrowPhaseCL\\");
+
+	Data* data = new Data;
+	data->m_device = device;
+	data->m_supportCullingKernel = device->getKernel( PATH, KERNEL0, options,src[TYPE] );
+	data->m_narrowphaseKernel = device->getKernel( PATH, KERNEL1, options, src[TYPE]);
+	data->m_narrowphaseWithPlaneKernel = device->getKernel( PATH, "NarrowphaseWithPlaneKernel", options,src[TYPE]);
+	// Single u32 that the kernels use as their output counter.
+	data->m_counterBuffer = new Buffer<u32>( device, 1 );
+
+	return data;
+}
+
+
+// Releases what allocate() created: the counter buffer first, then the Data
+// object. The kernel pointers are not deleted here.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::deallocate( Data* data )
+{
+	Buffer<u32>* counter = data->m_counterBuffer;
+	delete counter;
+	delete data;
+}
+
+// Allocates a buffer of `capacity` ShapeData entries on `device` and returns it
+// as an opaque ShapeDataType handle. Asserts that the device matches TYPE.
+template<DeviceType TYPE>
+ShapeDataType ChNarrowphase<TYPE>::allocateShapeBuffer( const Device* device, int capacity )
+{
+	ADLASSERT( device->m_type == TYPE );
+
+	Buffer<ShapeData>* shapeBuffer = new Buffer<ShapeData>( device, capacity );
+	return shapeBuffer;
+}
+
+// Deletes a shape buffer obtained from allocateShapeBuffer(); the handle is cast
+// back to its concrete Buffer<ShapeData> type first.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::deallocateShapeBuffer( ShapeDataType shapeBuf )
+{
+	delete (Buffer<ShapeData>*)shapeBuf;
+}
+
+// Converts an arbitrary ShapeBase into a temporary ConvexHeightField and uploads
+// it to slot idx of the shape buffer.
+// The margin expansion, packing and device write are done by the
+// ConvexHeightField overload of setShape, so that logic exists in one place only.
+// The temporary height field is deleted before returning.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::setShape( ShapeDataType shapeBuf, ShapeBase* shape, int idx, float collisionMargin )
+{
+	ConvexHeightField* cvxShape = new ConvexHeightField( shape );
+	// cvxShape is a ConvexHeightField*, so this resolves to the other overload.
+	setShape( shapeBuf, cvxShape, idx, collisionMargin );
+	delete cvxShape;
+}
+
+// Uploads one ConvexHeightField to slot idx of the shape buffer.
+// Expands cvxShape->m_aabb by collisionMargin (a side effect on the caller's
+// object), copies the normals, packs the u8 height and support-height samples
+// four per u32, writes the entry and waits for the device to finish.
+template<DeviceType TYPE>
+void ChNarrowphase<TYPE>::setShape( ShapeDataType shapeBuf, ConvexHeightField* cvxShape, int idx, float collisionMargin )
+{
+	const int nCells = HEIGHT_RES*HEIGHT_RES*6;
+	const int nPackedWords = nCells/4;
+
+	cvxShape->m_aabb.expandBy( make_float4( collisionMargin ) );
+
+	ShapeData packed;
+	for(int cell=0; cell<nCells; cell++)
+	{
+		packed.m_normal[cell] = cvxShape->m_normal[cell];
+	}
+	for(int word=0; word<nPackedWords; word++)
+	{
+		const int first = 4*word;
+		packed.m_height4[word] = ChNarrowphaseImp::u32Pack( cvxShape->m_data[first], cvxShape->m_data[first+1], cvxShape->m_data[first+2], cvxShape->m_data[first+3] );
+		packed.m_supportHeight4[word] = ChNarrowphaseImp::u32Pack( cvxShape->m_supportHeight[first], cvxShape->m_supportHeight[first+1], cvxShape->m_supportHeight[first+2], cvxShape->m_supportHeight[first+3] );
+	}
+	packed.m_scale = cvxShape->m_scale;
+
+	Buffer<ShapeData>* dst = (Buffer<ShapeData>*)shapeBuf;
+	dst->write( &packed, 1, idx );
+	DeviceUtils::waitForCompletion( dst->m_device );
+}
+
+// Run NarrowphaseKernel
+// Launches the narrowphase kernel over `nPairs` body pairs and appends the
+// resulting contacts to contactOut.
+//   nContacts  in: contacts already stored in contactOut;
+//              out: new total, clamped to contactOut->getSize().
+// Returns immediately (nContacts untouched) when nPairs is 0.
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+void ChNarrowphase<TYPE>::execute( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg )
+{
+	if( nPairs == 0 ) return;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	// Get buffers usable on this device type.
+	// NOTE(review): presumably map<TYPE, true> copies the contents across when the
+	// source buffer lives on another device type -- confirm in BufferUtils.
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* gContactOutNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, contactOut );	//	this might not be empty
+
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	// m_paddings is left unset.
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	// Only the space after the existing contacts is available to the kernel.
+	cdata.m_capacity = contactOut->getSize() - nContacts;
+
+	// Seed the device counter with the current contact count.
+	u32 n = nContacts;
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+
+	{
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gContactOutNative ),
+			BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_narrowphaseKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		// nPairs*64 with a second argument of 64: presumably 64 work-items per pair.
+		launcher.launch1D( nPairs*64, 64 );
+	}
+
+	// Read the counter back; n is only used after the wait below.
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );
+
+	// NOTE(review): the bool presumably selects copy-back on unmap (true only for
+	// the output buffer) -- confirm in BufferUtils.
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gContactOutNative, contactOut );
+
+	// Clamp in case the counter ran past the buffer size.
+	nContacts = min2((int)n, contactOut->getSize() );
+}
+
+// Run NarrowphaseWithPlaneKernel
+// Same flow as the other execute() overload, but launches
+// m_narrowphaseWithPlaneKernel.
+//   nContacts  in: contacts already stored in contactOut;
+//              out: new total, clamped to contactOut->getSize().
+// Returns immediately (nContacts untouched) when nPairs is 0.
+// NOTE(review): vtxBuf and idxBuf are accepted but never bound to the kernel in
+// this body -- confirm whether that is intended.
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+void ChNarrowphase<TYPE>::execute( Data* data, const Buffer<int2>* pairs, int nPairs, 
+			const Buffer<RigidBodyBase::Body>* bodyBuf, const ShapeDataType shapeBuf,
+			const Buffer<float4>* vtxBuf, const Buffer<int4>* idxBuf,
+			Buffer<Contact4>* contactOut, int& nContacts, const Config& cfg )
+{
+	if( nPairs == 0 ) return;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	// Get buffers usable on this device type.
+	// NOTE(review): presumably map<TYPE, true> copies the contents across when the
+	// source buffer lives on another device type -- confirm in BufferUtils.
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );	
+	Buffer<Contact4>* gContactOutNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, contactOut );	//	this might not be empty
+
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	// m_paddings is left unset.
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	// Only the space after the existing contacts is available to the kernel.
+	cdata.m_capacity = contactOut->getSize() - nContacts;
+
+	// Seed the device counter with the current contact count.
+	u32 n = nContacts;
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+
+	{
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gContactOutNative ),
+			BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_narrowphaseWithPlaneKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		// nPairs*64 with a second argument of 64: presumably 64 work-items per pair.
+		launcher.launch1D( nPairs*64, 64 );
+	}
+
+	// Read the counter back; n is only used after the wait below.
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );
+
+	// NOTE(review): the bool presumably selects copy-back on unmap (true only for
+	// the output buffer) -- confirm in BufferUtils.
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gContactOutNative, contactOut );
+
+	// Clamp in case the counter ran past the buffer size.
+	nContacts = min2((int)n, contactOut->getSize() );
+}
+
+// Run SupportCullingKernel
+// Launches the support-culling kernel over `nPairs` pairs and stores the pairs it
+// emits in pairsOut. Returns the number emitted, clamped to pairsOut->getSize();
+// returns 0 without touching anything when nPairs is 0.
+// NOTE(review): pairsOut is declared const although it receives the output.
+template<DeviceType TYPE>
+//template<bool USE_OMP>
+int ChNarrowphase<TYPE>::culling( Data* data, const Buffer<int2>* pairs, int nPairs, const Buffer<RigidBodyBase::Body>* bodyBuf,
+			const ShapeDataType shapeBuf, const Buffer<int2>* pairsOut, const Config& cfg )
+{
+	if( nPairs == 0 ) return 0;
+
+	Buffer<ShapeData>* shapeBuffer = (Buffer<ShapeData>*)shapeBuf;
+	ADLASSERT( shapeBuffer->getType() == TYPE );
+
+	const Device* device = data->m_device;
+
+	// Get buffers usable on this device type. The output is mapped with `false`,
+	// unlike the inputs.
+	// NOTE(review): presumably that flag controls whether contents are copied in --
+	// confirm in BufferUtils.
+	Buffer<int2>* gPairsInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, pairs );
+	Buffer<RigidBodyBase::Body>* gBodyInNative 
+		= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );	
+	Buffer<int2>* gPairsOutNative 
+		= BufferUtils::map<TYPE, false>( data->m_device, pairsOut );
+
+	//
+	Buffer<ConstData> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+
+	// m_paddings is left unset.
+	ConstData cdata;
+	cdata.m_nPairs = nPairs;
+	cdata.m_collisionMargin = cfg.m_collisionMargin;
+	cdata.m_capacity = pairsOut->getSize();
+
+	// The output counter starts at zero: pairsOut is filled from the beginning.
+	u32 n = 0;
+	data->m_counterBuffer->write( &n, 1 );
+//	DeviceUtils::waitForCompletion( device );
+	{
+		BufferInfo bInfo[] = { BufferInfo( gPairsInNative, true ), BufferInfo( shapeBuffer ), BufferInfo( gBodyInNative ), 
+			BufferInfo( gPairsOutNative ), BufferInfo( data->m_counterBuffer ) };
+		Launcher launcher( data->m_device, data->m_supportCullingKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		// Launched with nPairs here, where execute() uses nPairs*64.
+		launcher.launch1D( nPairs, 64 );
+	}
+	// Read the counter back; n is only used after the wait below.
+	data->m_counterBuffer->read( &n, 1 );
+	DeviceUtils::waitForCompletion( device );
+// Disabled manual cleanup, superseded by the BufferUtils::unmap calls below.
+/*
+	if( gPairsInNative != pairs ) delete gPairsInNative;
+	if( gBodyInNative != bodyBuf ) delete gBodyInNative;
+	if( gPairsOutNative != pairsOut ) 
+	{
+		gPairsOutNative->read( pairsOut->m_ptr, n );
+		DeviceUtils::waitForCompletion( device );
+		delete gPairsOutNative;
+	}
+*/
+	BufferUtils::unmap<false>( gPairsInNative, pairs );
+	BufferUtils::unmap<false>( gBodyInNative, bodyBuf );
+	BufferUtils::unmap<true>( gPairsOutNative, pairsOut );
+
+	// Clamp in case the counter ran past the buffer size.
+	return min2((int)n, pairsOut->getSize() );
+}
+
+#undef PATH
+#undef KERNEL0
+#undef KERNEL1
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.cl
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/ChNarrowphaseKernels.h
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.h
@@ -0,0 +1,203 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+#ifndef __ADL_SOLVER_H
+#define __ADL_SOLVER_H
+
+
+#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+#include <AdlPrimitives/Search/BoundSearch.h>
+#include <AdlPrimitives/Sort/RadixSort.h>
+#include <AdlPrimitives/Scan/PrefixScan.h>
+#include <AdlPrimitives/Sort/RadixSort32.h>
+
+//#include <AdlPhysics/TypeDefinition.h>
+#include "AdlRigidBody.h"
+#include "AdlContact4.h"
+
+//#include "AdlPhysics/Batching/Batching.h>
+
+
+#define MYF4 float4
+#define MAKE_MYF4 make_float4
+
+//#define MYF4 float4sse
+//#define MAKE_MYF4 make_float4sse
+
+#include "AdlConstraint4.h"
+
+namespace adl
+{
+//	Device-independent part of the contact solver: the host-side constraint layout, the per-step
+//	configuration and the buffer allocation helpers shared by every Solver<TYPE>.
+class SolverBase
+{
+	public:
+		
+
+		//	One constraint row as seen from the host.
+		struct ConstraintData
+		{
+			//	Every scalar member gets a defined value so that isInvalid() (and any copy of a
+			//	default-constructed element) never reads indeterminate memory.
+			ConstraintData()
+				: m_jacCoeffInv(0.f), m_b(0.f), m_appliedRambdaDt(0.f), m_bodyAPtr(0), m_bodyBPtr(0) {}
+
+			float4 m_linear; // has to be normalized; the w component carries the friction coefficient
+			float4 m_angular0;
+			float4 m_angular1;
+			float m_jacCoeffInv;
+			float m_b;
+			float m_appliedRambdaDt;
+
+			u32 m_bodyAPtr;
+			u32 m_bodyBPtr;
+
+			//	True when both body references are zero. A bitwise OR is used instead of a sum so
+			//	that two non-zero references can never wrap around to 0.
+			bool isInvalid() const { return (m_bodyAPtr|m_bodyBPtr) == 0; }
+			float getFrictionCoeff() const { return m_linear.w; }
+			void setFrictionCoeff(float coeff) { m_linear.w = coeff; }
+		};
+
+		//	Per-step settings consumed by the constraint setup and contact sorting code.
+		struct ConstraintCfg
+		{
+			//	dt is the only constructor argument; everything else starts from a fixed default.
+			//	m_enableParallelSolve and m_averageExtent are read by the solver, so they are given
+			//	defined defaults here (parallel path off, extent 1 so that the cell scale
+			//	1/(N_OBJ_PER_SPLIT*m_averageExtent) is finite). Callers are still expected to set
+			//	both to values matching their scene.
+			ConstraintCfg( float dt = 0.f )
+				: m_positionDrift( 0.005f ), m_positionConstraintCoeff( 0.2f ), m_dt(dt),
+				m_enableParallelSolve( false ), m_averageExtent( 1.f ), m_staticIdx(-1) {}
+
+			float m_positionDrift;
+			float m_positionConstraintCoeff;
+			float m_dt;
+			bool m_enableParallelSolve;	//	enables the sort + batch stage in reorderConvertToConstraints
+			float m_averageExtent;	//	used to scale positions into sort cells (see sortContacts)
+			int m_staticIdx;	//	forwarded to the sort-data and batching kernels
+		};
+
+		//	Allocates a device buffer holding `capacity` Contact4 elements; release it with deallocateContact4.
+		static
+		__inline
+		Buffer<Contact4>* allocateContact4( const Device* device, int capacity )
+		{
+			return new Buffer<Contact4>( device, capacity );	
+		}
+
+		static
+		__inline
+		void deallocateContact4( Buffer<Contact4>* data ) { delete data; }
+
+		//	Allocates a Buffer<Constraint4> of `capacity` elements and returns it as an opaque SolverData handle.
+		static
+		__inline
+		SolverData allocateConstraint4( const Device* device, int capacity )
+		{
+			return new Buffer<Constraint4>( device, capacity );
+		}
+
+		//	Counterpart of allocateConstraint4: the handle is cast back to Buffer<Constraint4>* before deletion.
+		static
+		__inline
+		void deallocateConstraint4( SolverData data ) { delete (Buffer<Constraint4>*)data; }
+
+		//	Stub: no friction constraint storage is allocated, a null pointer is always returned.
+		static
+		__inline
+		void* allocateFrictionConstraint( const Device* device, int capacity, u32 type = 0 )
+		{
+			return 0;
+		}
+
+		//	Stub matching allocateFrictionConstraint: nothing to release.
+		static
+		__inline
+		void deallocateFrictionConstraint( void* data ) 
+		{
+		}
+
+		//	Sizing constants shared between host code and the kernel launches.
+		enum
+		{
+			N_SPLIT = 16,
+			N_BATCHES = 4,
+			N_OBJ_PER_SPLIT = 10,
+			N_TASKS_PER_BATCH = N_SPLIT*N_SPLIT,
+		};
+};
+
+//	Contact constraint solver for device type TYPE. All entry points are static and operate on a
+//	Data object obtained from allocate() and released with deallocate().
+template<DeviceType TYPE>
+class Solver : public SolverBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		//	Solver state: the device, the kernels and the helper primitives / scratch buffers.
+		struct Data
+		{
+			//	Pointer members start out null so that a Data that has not (or only partially)
+			//	been filled in never exposes indeterminate pointers.
+			Data()
+				: m_device(0), m_parallelSolveData(0), m_nIterations(4),
+				m_batchingKernel(0), m_batchSolveKernel(0), m_contactToConstraintKernel(0),
+				m_setSortDataKernel(0), m_reorderContactKernel(0), m_copyConstraintKernel(0),
+				m_sort32(0), m_search(0), m_scan(0),
+				m_sortDataBuffer(0), m_contactBuffer(0) {}
+
+			const Device* m_device;
+			void* m_parallelSolveData;	//	implementation-specific batching state, stored type-erased
+			int m_nIterations;	//	number of solver iterations per pass (defaults to 4)
+			Kernel* m_batchingKernel;
+			Kernel* m_batchSolveKernel;
+			Kernel* m_contactToConstraintKernel;
+			Kernel* m_setSortDataKernel;
+			Kernel* m_reorderContactKernel;
+			Kernel* m_copyConstraintKernel;
+			//typename RadixSort<TYPE>::Data* m_sort;
+			typename RadixSort32<TYPE>::Data* m_sort32;
+			typename BoundSearch<TYPE>::Data* m_search;
+			typename PrefixScan<TYPE>::Data* m_scan;
+			Buffer<SortData>* m_sortDataBuffer;
+			Buffer<Contact4>* m_contactBuffer;	//	scratch contacts; may be null until first use
+		};
+
+		enum
+		{
+			//	allocate() only preallocates m_contactBuffer for pair capacities below this value.
+			DYNAMIC_CONTACT_ALLOCATION_THRESHOLD = 2000000,
+		};
+
+		//	Creates the solver state for `device`; pairCapacity sizes the scratch buffers.
+		static
+		Data* allocate( const Device* device, int pairCapacity );
+
+		//	Releases everything created by allocate(), including `data` itself.
+		static
+		void deallocate( Data* data );
+
+		//	Optionally sorts and batches the contacts (cfg.m_enableParallelSolve), then converts
+		//	them into constraints written to contactCOut.
+		static
+		void reorderConvertToConstraints( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+		const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		//	Runs the batched solve kernel over the constraints for data->m_nIterations iterations.
+		static
+		void solveContactConstraint( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* inertiaBuf, 
+			SolverData constraint, void* additionalData, int n );
+
+//		static
+//		int createSolveTasks( int batchIdx, Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+//			SolverData constraint, int n, ThreadPool::Task* tasksOut[], int taskCapacity );
+
+
+		//private:
+		//	Launches the contact-to-constraint kernel for nContacts contacts.
+		static
+		void convertToConstraints( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		//	Sorts the contacts by cell and writes the reordered result to data->m_contactBuffer.
+		static
+		void sortContacts( Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const ConstraintCfg& cfg );
+
+		//	Runs the batching kernel over `contacts`, using the per-cell counts `n` and `offsets`.
+		static
+		void batchContacts( Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx );
+
+};
+
+#include "Solver.inl"
+#include "SolverHost.inl"
+};
+
+#undef MYF4
+#undef MAKE_MYF4
+
+#endif //__ADL_SOLVER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
@@ -0,0 +1,762 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\SolverKernels"
+#define BATCHING_PATH "..\\..\\dynamics\\basic_demo\\Stubs\\batchingKernels"
+
+#define KERNEL1 "SingleBatchSolveKernel"
+#define KERNEL2 "BatchSolveKernel"
+
+#define KERNEL3 "ContactToConstraintKernel"
+#define KERNEL4 "SetSortDataKernel"
+#define KERNEL5 "ReorderContactKernel"
+#include "SolverKernels.h"
+
+#include "batchingKernels.h"
+
+
+//	Per-work-item scratch record used only by the DEBUG_ME / BATCH_DEBUG code paths in this file:
+//	the host zero-fills an array of these, uploads it through a Buffer<SolverDebugInfo> that is
+//	passed to the kernel as an extra buffer argument, reads it back after the launch and reports
+//	entries whose integer slots are positive.
+//	The member order and count are shared with the kernel through that buffer, so they should not
+//	be changed on the host side alone.
+struct SolverDebugInfo
+{
+	//	16 general purpose integer slots; their meaning is defined by whatever the kernel writes.
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	
+	int m_valInt4;
+	int m_valInt5;
+	int m_valInt6;
+	int m_valInt7;
+
+	int m_valInt8;
+	int m_valInt9;
+	int m_valInt10;
+	int m_valInt11;
+
+	int	m_valInt12;
+	int	m_valInt13;
+	int	m_valInt14;
+	int	m_valInt15;
+
+
+	//	4 general purpose float slots.
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+};
+
+
+
+
+//	Groups the helper types private to the device (GPU) solver implementation.
+struct SolverDeviceInl
+{
+	//	Batching state stored behind Solver<TYPE>::Data::m_parallelSolveData.
+	//	Solver<TYPE>::allocate creates both buffers with N_SPLIT*N_SPLIT entries.
+	struct ParallelSolveData
+	{
+		Buffer<u32>* m_numConstraints;	//	per-cell counts, produced by the bound search in sortContacts
+		Buffer<u32>* m_offsets;	//	prefix scan of m_numConstraints
+	};
+};
+
+//	Builds the solver state for `device`: looks up every kernel the solver launches and allocates
+//	the sort / search / scan helpers together with the scratch buffers.
+//	pairCapacity sizes the sort scratch (rounded up to a multiple of 512) and, when it is below
+//	DYNAMIC_CONTACT_ALLOCATION_THRESHOLD, the scratch contact buffer; otherwise that buffer is left
+//	null and reorderConvertToConstraints creates it on first use.
+//	The returned object has to be released with Solver<TYPE>::deallocate.
+template<DeviceType TYPE>
+typename Solver<TYPE>::Data* Solver<TYPE>::allocate( const Device* device, int pairCapacity )
+{
+	//	Kernel sources indexed by TYPE. Without ADL_LOAD_KERNEL_FROM_STRING every entry is null
+	//	and getKernel only receives the file path.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* solverSrc[] = { solverKernelsCL, 0 };
+	const char* batchingSrc[] = { batchingKernelsCL, 0 };
+#else
+	const char* solverSrc[] = { 0, 0 };
+	const char* batchingSrc[] = { 0, 0 };
+#endif
+
+	const char* buildOptions = "-I ..\\..\\ ";
+	const int nCells = N_SPLIT*N_SPLIT;
+	const int sortCapacity = NEXTMULTIPLEOF( pairCapacity, 512 );
+	bool useKernelCache = true;
+
+	Data* solver = new Data;
+	solver->m_device = device;
+
+	//	Only the batching and batch-solve kernels pass the cache flag explicitly.
+	solver->m_batchingKernel = device->getKernel( BATCHING_PATH, "CreateBatches", buildOptions, batchingSrc[TYPE], useKernelCache );
+	solver->m_batchSolveKernel = device->getKernel( PATH, KERNEL2, buildOptions, solverSrc[TYPE], useKernelCache );
+	solver->m_contactToConstraintKernel = device->getKernel( PATH, KERNEL3, buildOptions, solverSrc[TYPE] );
+	solver->m_setSortDataKernel = device->getKernel( PATH, KERNEL4, buildOptions, solverSrc[TYPE] );
+	solver->m_reorderContactKernel = device->getKernel( PATH, KERNEL5, buildOptions, solverSrc[TYPE] );
+	solver->m_copyConstraintKernel = device->getKernel( PATH, "CopyConstraintKernel", buildOptions, solverSrc[TYPE] );
+
+	//	Per-cell constraint counts and offsets used by the batching and solve kernels.
+	SolverDeviceInl::ParallelSolveData* batchState = new SolverDeviceInl::ParallelSolveData;
+	batchState->m_numConstraints = new Buffer<u32>( device, nCells );
+	batchState->m_offsets = new Buffer<u32>( device, nCells );
+	solver->m_parallelSolveData = batchState;
+
+	//	todo. remove the hard-coded 512 alignment of the sort capacity
+	solver->m_sort32 = RadixSort32<TYPE>::allocate( device, sortCapacity );
+	solver->m_search = BoundSearch<TYPE>::allocate( device, nCells );
+	solver->m_scan = PrefixScan<TYPE>::allocate( device, nCells );
+
+	solver->m_sortDataBuffer = new Buffer<SortData>( device, sortCapacity );
+
+	//	Large capacities defer the contact scratch allocation until the real contact count is known.
+	solver->m_contactBuffer = ( pairCapacity < DYNAMIC_CONTACT_ALLOCATION_THRESHOLD )
+		? new Buffer<Contact4>( device, pairCapacity )
+		: 0;
+
+	return solver;
+}
+
+//	Releases everything Solver<TYPE>::allocate created: the batching state, the sort / search /
+//	scan helpers, the scratch buffers and finally `data` itself. `data` must not be used afterwards.
+template<DeviceType TYPE>
+void Solver<TYPE>::deallocate( Data* data )
+{
+	//	m_parallelSolveData is stored type-erased; for this implementation it is a ParallelSolveData.
+	SolverDeviceInl::ParallelSolveData* batchState = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+	delete batchState->m_numConstraints;
+	delete batchState->m_offsets;
+	delete batchState;
+
+	RadixSort32<TYPE>::deallocate( data->m_sort32 );
+	BoundSearch<TYPE>::deallocate( data->m_search );
+	PrefixScan<TYPE>::deallocate( data->m_scan );
+
+	delete data->m_sortDataBuffer;
+	delete data->m_contactBuffer;	//	may be null; deleting a null pointer is a no-op
+
+	delete data;
+}
+
+//	Prepares the contacts for the batched solver and converts them into constraints.
+//
+//	Steps:
+//	 1. make sure the scratch buffer data->m_contactBuffer can hold nContacts elements;
+//	 2. if cfg.m_enableParallelSolve: sort the contacts by cell (result in data->m_contactBuffer),
+//	    copy them back into the mapped input buffer and run the batching kernel on them;
+//	 3. convert the contacts into constraints written to contactCOut.
+//
+//	data        solver state from allocate()
+//	bodyBuf     rigid bodies referenced by the contacts
+//	shapeBuf    inertia data, forwarded to convertToConstraints
+//	contactsIn  contacts; reordered in place when the parallel path is enabled
+//	contactCOut opaque handle to the Buffer<Constraint4> receiving the constraints
+//	nContacts   number of valid entries in contactsIn
+//	cfg         step configuration (m_enableParallelSolve, m_staticIdx, ...)
+template<DeviceType TYPE>
+void Solver<TYPE>::reorderConvertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	//	Drop a scratch buffer that is too small; it is recreated with the right size just below.
+	if( data->m_contactBuffer )
+	{
+		if( data->m_contactBuffer->getSize() < nContacts )
+		{
+			BT_PROFILE("delete data->m_contactBuffer;");
+			delete data->m_contactBuffer;
+			data->m_contactBuffer = 0;
+		}
+	}
+	if( data->m_contactBuffer == 0 )
+	{
+		BT_PROFILE("new data->m_contactBuffer;");
+
+		data->m_contactBuffer = new Buffer<Contact4>( data->m_device, nContacts );
+	}
+	//	NOTE(review): the stopwatch is only started on the parallel path, but split()/stop() are
+	//	called unconditionally and the timings are not printed any more - confirm this is harmless.
+	Stopwatch sw;
+
+	Buffer<Contact4>* contactNative = BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn, nContacts );
+
+	//DeviceUtils::Config dhCfg;
+	//Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		DeviceUtils::waitForCompletion( data->m_device );
+		sw.start();
+		//	contactsIn -> data->m_contactBuffer
+		{
+			BT_PROFILE("sortContacts");
+			Solver<TYPE>::sortContacts( data, bodyBuf, contactNative, additionalData, nContacts, cfg );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		sw.split();
+		//	data->m_contactBuffer -> contactNative; the disabled branch is the host round-trip
+		//	equivalent of the copy kernel used below.
+		if(0)
+		{
+			Contact4* tmp = new Contact4[nContacts];
+			data->m_contactBuffer->read( tmp, nContacts );
+			DeviceUtils::waitForCompletion( data->m_contactBuffer->m_device );
+			contactNative->write( tmp, nContacts );
+			DeviceUtils::waitForCompletion( contactNative->m_device );
+			delete [] tmp;
+		}
+		else
+		{
+			BT_PROFILE("m_copyConstraintKernel");
+
+			Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+			//	Initialize every component: the whole int4 is handed to the launcher, so no
+			//	indeterminate values may be left in it.
+			int4 cdata = make_int4( nContacts, 0, 0, 0 );
+			BufferInfo bInfo[] = { BufferInfo( data->m_contactBuffer ), BufferInfo( contactNative ) };
+//			Launcher launcher( data->m_device, data->m_device->getKernel( PATH, "CopyConstraintKernel",  "-I ..\\..\\ -Wf,--c++", 0 ) );
+			Launcher launcher( data->m_device, data->m_copyConstraintKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			launcher.launch1D( nContacts, 64 );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		{
+			BT_PROFILE("batchContacts");
+			Solver<TYPE>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, cfg.m_staticIdx );
+
+		}
+	}
+	{
+			BT_PROFILE("waitForCompletion (batchContacts)");
+			DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.split();
+	//================
+	if(0)
+	{
+//		Solver<TYPE_HOST>::Data* solverHost = Solver<TYPE_HOST>::allocate( deviceHost, nContacts );
+//		Solver<TYPE_HOST>::convertToConstraints( solverHost, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+//		Solver<TYPE_HOST>::deallocate( solverHost );
+	}
+	else
+	{
+		BT_PROFILE("convertToConstraints");
+		Solver<TYPE>::convertToConstraints( data, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+	}
+	{
+		BT_PROFILE("convertToConstraints waitForCompletion");
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.stop();
+
+	{
+		BT_PROFILE("printf");
+
+		float t[5];
+		sw.getMs( t, 3 );
+//		printf("%3.2f, %3.2f, %3.2f, ", t[0], t[1], t[2]);
+	}
+
+	{
+		BT_PROFILE("deallocate and unmap");
+
+		//DeviceUtils::deallocate( deviceHost );
+
+		//	Copy the (possibly reordered) contacts back to the caller's buffer.
+		BufferUtils::unmap<true>( contactNative, contactsIn, nContacts );
+	}
+}
+
+
+//	Solves the constraints with the batched solve kernel.
+//
+//	Two passes are run, selected through cdata.x (0 first, then 1; presumably the normal and the
+//	friction pass - confirm against the kernel). Each pass performs data->m_nIterations
+//	iterations, and every iteration launches the kernel once per batch index (0..N_BATCHES-1).
+//	The per-cell constraint counts and offsets come from data->m_parallelSolveData, i.e. they must
+//	have been produced beforehand by the sort / batch stage.
+//
+//	data        solver state from allocate()
+//	bodyBuf     rigid bodies; copied back after the solve
+//	shapeBuf    inertia data (read only)
+//	constraint  opaque handle to the Buffer<Constraint4> to solve; copied back after the solve
+//	n           constraint count; not forwarded to the kernel in the current implementation
+template<DeviceType TYPE>
+void Solver<TYPE>::solveContactConstraint( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	//	Disabled host fallback, kept as a debugging switch.
+	if(0)
+	{
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::solveContactConstraint( hostData, bodyBuf, shapeBuf, constraint, additionalData, n );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	ADLASSERT( data );
+
+	Buffer<Constraint4>* cBuffer =0;
+	
+	Buffer<RigidBodyBase::Body>* gBodyNative=0; 
+	Buffer<RigidBodyBase::Inertia>* gShapeNative =0;
+	Buffer<Constraint4>* gConstraintNative =0;
+	
+
+	{
+		BT_PROFILE("map");
+		cBuffer = (Buffer<Constraint4>*)constraint;
+
+		gBodyNative= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		gShapeNative= BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		gConstraintNative = BufferUtils::map<TYPE, true>( data->m_device, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+
+	//	Created the same way as at every other launch site in this file, so that setConst always
+	//	receives a real constant buffer instead of a default-constructed one.
+	Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+	int4 cdata = make_int4( n, 0, 0, 0 );
+	{
+		SolverDeviceInl::ParallelSolveData* solveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+		const int nn = N_SPLIT*N_SPLIT;
+
+		cdata.x = 0;	//	pass selector: first pass
+		cdata.y = 250;	//	NOTE(review): magic constant consumed by the kernel - meaning not visible here
+
+#if 0
+//check how the cells are filled
+		unsigned int* hostCounts = new unsigned int[N_SPLIT*N_SPLIT];
+		solveData->m_numConstraints->read(hostCounts,N_SPLIT*N_SPLIT);
+		DeviceUtils::waitForCompletion( data->m_device );
+		for (int i=0;i<N_SPLIT*N_SPLIT;i++)
+		{
+			if (hostCounts[i])
+			{
+				printf("hostCounts[%d]=%d\n",i,hostCounts[i]);
+			}
+		}
+		delete[] hostCounts;
+#endif
+
+		//	One 64-wide work group per cell of a batch.
+		int numWorkItems = 64*nn/N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+#endif
+
+
+
+		{
+
+			BT_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+#ifdef DEBUG_ME
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+				
+
+					BufferInfo bInfo[] = { 
+
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets ) 
+#ifdef DEBUG_ME
+						,	BufferInfo(&gpuDebugInfo)
+#endif
+						};
+
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					
+					launcher.launch1D( numWorkItems, 64 );
+
+#ifdef DEBUG_ME
+					DeviceUtils::waitForCompletion( data->m_device );
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					DeviceUtils::waitForCompletion( data->m_device );
+					for (int i=0;i<numWorkItems;i++)
+					{
+						//	The format needs one specifier per argument: index first, then value.
+						if (debugInfo[i].m_valInt2>0)
+						{
+							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+
+
+				}
+			}
+		
+			DeviceUtils::waitForCompletion( data->m_device );
+
+
+		}
+
+		cdata.x = 1;	//	pass selector: second pass
+		{
+			BT_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+					BufferInfo bInfo[] = { 
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets )
+#ifdef DEBUG_ME
+						,BufferInfo(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					launcher.launch1D( numWorkItems, 64 );
+				}
+			}
+			DeviceUtils::waitForCompletion( data->m_device );
+			
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+
+	{
+		BT_PROFILE("unmap");
+		//	Bodies and constraints are copied back; the inertia data is not.
+		BufferUtils::unmap<true>( gBodyNative, bodyBuf );
+		BufferUtils::unmap<false>( gShapeNative, shapeBuf );
+		BufferUtils::unmap<true>( gConstraintNative, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+}
+
+//	Converts nContacts contacts into solver constraints by launching the contact-to-constraint
+//	kernel with one work item per contact (work group size 64).
+//	Inputs (bodies, inertia, contacts) are mapped with a copy to the device and not copied back;
+//	the constraint buffer behind contactCOut is mapped without a copy and copied back afterwards.
+//	Only valid for OpenCL devices (asserted). additionalData is not used here.
+template<DeviceType TYPE>
+void Solver<TYPE>::convertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	//	Constant block handed to the kernel through setConst; presumably it has to match the
+	//	kernel-side declaration member for member.
+	struct CB
+	{
+		int m_nContacts;
+		float m_dt;
+		float m_positionDrift;
+		float m_positionConstraintCoeff;
+	};
+
+	const Device* device = data->m_device;
+	Buffer<Constraint4>* constraintsOut = (Buffer<Constraint4>*)contactCOut;
+
+	Buffer<RigidBodyBase::Body>* gpuBodies = 0;
+	Buffer<RigidBodyBase::Inertia>* gpuInertia = 0;
+	Buffer<Contact4>* gpuContacts = 0;
+	Buffer<Constraint4>* gpuConstraints = 0;
+
+	{
+		BT_PROFILE("map buffers");
+
+		gpuBodies = BufferUtils::map<TYPE, true>( device, bodyBuf );
+		gpuInertia = BufferUtils::map<TYPE, true>( device, shapeBuf );
+		gpuContacts = BufferUtils::map<TYPE, true>( device, contactsIn );
+		gpuConstraints = BufferUtils::map<TYPE, false>( device, constraintsOut );
+	}
+
+	{
+		BT_PROFILE("m_contactToConstraintKernel");
+		CB kernelArgs = { nContacts, cfg.m_dt, cfg.m_positionDrift, cfg.m_positionConstraintCoeff };
+
+		Buffer<CB> constBuffer( device, 1, BufferBase::BUFFER_CONST );
+		BufferInfo kernelBuffers[] = {
+			BufferInfo( gpuContacts ),
+			BufferInfo( gpuBodies ),
+			BufferInfo( gpuInertia ),
+			BufferInfo( gpuConstraints )
+		};
+		const int nKernelBuffers = sizeof(kernelBuffers)/sizeof(Launcher::BufferInfo);
+
+		Launcher launcher( device, data->m_contactToConstraintKernel );
+		launcher.setBuffers( kernelBuffers, nKernelBuffers );
+		launcher.setConst( constBuffer, kernelArgs );
+		launcher.launch1D( nContacts, 64 );
+		DeviceUtils::waitForCompletion( device );
+	}
+
+	{
+		BT_PROFILE("unmap");
+		//	Only the freshly written constraints travel back to the caller's buffer.
+		BufferUtils::unmap<false>( gpuBodies, bodyBuf );
+		BufferUtils::unmap<false>( gpuInertia, shapeBuf );
+		BufferUtils::unmap<false>( gpuContacts, contactsIn );
+		BufferUtils::unmap<true>( gpuConstraints, constraintsOut );
+	}
+}
+
+//	Sorts the contacts by cell index and writes the reordered contacts to data->m_contactBuffer.
+//	Nothing but the buffer map / unmap happens unless cfg.m_enableParallelSolve is set.
+//
+//	Steps on the parallel path:
+//	 2. the set-sort-data kernel fills data->m_sortDataBuffer (sortSize work items);
+//	 3. the sort data is radix sorted;
+//	 4. per-cell counts (bound search) and their prefix scan are written to the
+//	    ParallelSolveData buffers;
+//	 5. the reorder kernel writes the contacts in sorted order to data->m_contactBuffer.
+//
+//	The caller has to make sure data->m_contactBuffer holds at least nContacts elements
+//	(reorderConvertToConstraints does). Only valid for OpenCL devices (asserted).
+template<DeviceType TYPE>
+void Solver<TYPE>::sortContacts( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+	Buffer<RigidBodyBase::Body>* bodyNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn );
+
+	const int sortAlignment = 512; // todo. get this out of sort
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
+
+		//	The sort scratch was sized from the pair capacity given to allocate(). The kernels
+		//	below touch sortSize elements, so grow the scratch (and the radix sort state, which
+		//	is allocated for a maximum size as well) when more contacts arrive than planned for.
+		if( data->m_sortDataBuffer->getSize() < sortSize )
+		{
+			DeviceUtils::waitForCompletion( data->m_device );
+
+			delete data->m_sortDataBuffer;
+			data->m_sortDataBuffer = new Buffer<SortData>( data->m_device, sortSize );
+
+			RadixSort32<TYPE>::deallocate( data->m_sort32 );
+			data->m_sort32 = RadixSort32<TYPE>::allocate( data->m_device, sortSize );
+		}
+
+		Buffer<u32>* countsNative = nativeSolveData->m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+		Buffer<u32>* offsetsNative = nativeSolveData->m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+
+		{	//	2. set cell idx
+			struct CB
+			{
+				int m_nContacts;
+				int m_staticIdx;
+				float m_scale;
+				int m_nSplit;
+			};
+
+			ADLASSERT( sortSize%64 == 0 );
+			CB cdata;
+			cdata.m_nContacts = nContacts;
+			cdata.m_staticIdx = cfg.m_staticIdx;
+			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
+			cdata.m_nSplit = N_SPLIT;
+
+			Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+			BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( bodyNative ), BufferInfo( data->m_sortDataBuffer ) };
+			Launcher launcher( data->m_device, data->m_setSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			//	Launched over the padded size so that the padding entries of the sort buffer
+			//	are written too before the sort runs over sortSize elements.
+			launcher.launch1D( sortSize, 64 );
+		}
+
+		{	//	3. sort by cell idx
+			RadixSort32<TYPE>::execute( data->m_sort32, *data->m_sortDataBuffer,sortSize);
+		}
+		{	//	4. find entries
+			BoundSearch<TYPE>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, BoundSearchBase::COUNT );
+
+			PrefixScan<TYPE>::execute( data->m_scan, *countsNative, *offsetsNative, N_SPLIT*N_SPLIT );
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	todo. preallocate this
+//			ADLASSERT( contactsIn->getType() == TYPE_HOST );
+//			Buffer<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
+
+			{
+				Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+				//	All four components are set; the whole int4 is handed to the launcher.
+				int4 cdata = make_int4( nContacts, 0, 0, 0 );
+				BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( data->m_contactBuffer ), BufferInfo( data->m_sortDataBuffer ) };
+				Launcher launcher( data->m_device, data->m_reorderContactKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+				launcher.setConst( constBuffer, cdata );
+				launcher.launch1D( nContacts, 64 );
+			}
+//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
+		}
+	}
+
+	//	Neither input is modified here, so nothing is copied back.
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<false>( contactNative, contactsIn );
+}
+
+//	Runs the batching kernel ("CreateBatches") over the cell-sorted contacts.
+//
+//	The kernel is launched with one 64-wide work group per cell (N_SPLIT*N_SPLIT cells) and is
+//	given `contacts`, the scratch buffer data->m_contactBuffer, the per-cell counts `n` and the
+//	per-cell `offsets`. Afterwards the first nContacts elements of data->m_contactBuffer are
+//	copied into `contacts`, so the caller finds the batched result in its own buffer.
+//
+//	data       solver state from allocate(); m_contactBuffer must hold at least nContacts elements
+//	contacts   contacts, already sorted by cell; overwritten with the batched contacts
+//	nContacts  number of valid contacts
+//	n, offsets per-cell constraint counts and their prefix scan (N_SPLIT*N_SPLIT entries each)
+//	staticIdx  forwarded to the kernel in cdata.z
+//	Only valid for OpenCL devices (asserted).
+template<DeviceType TYPE>
+void Solver<TYPE>::batchContacts( typename Solver<TYPE>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	//	Disabled host fallback, kept as a debugging switch.
+	if(0)
+	{
+		BT_PROFILE("CPU classTestKernel/Kernel (batch generation?)");
+
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::batchContacts( hostData, contacts, nContacts, n, offsets, staticIdx );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contacts, nContacts );
+	Buffer<u32>* nNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, n );
+	Buffer<u32>* offsetsNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, offsets );
+
+	{
+		BT_PROFILE("GPU classTestKernel/Kernel (batch generation?)");
+		Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		//	All four components are set; the whole int4 is handed to the launcher.
+		int4 cdata = make_int4( nContacts, 0, staticIdx, 0 );
+
+		int numWorkItems = 64*N_SPLIT*N_SPLIT;
+#ifdef BATCH_DEBUG
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+		gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+		BufferInfo bInfo[] = { 
+			BufferInfo( contactNative ), 
+			BufferInfo( data->m_contactBuffer ), 
+			BufferInfo( nNative ), 
+			BufferInfo( offsetsNative ) 
+#ifdef BATCH_DEBUG
+			,	BufferInfo(&gpuDebugInfo)
+#endif
+		};
+
+		
+		
+		Launcher launcher( data->m_device, data->m_batchingKernel);
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( numWorkItems, 64 );
+		DeviceUtils::waitForCompletion( data->m_device );
+
+#ifdef BATCH_DEBUG
+		//	Read back the batched contacts (handy to inspect in a debugger) and the debug records.
+		Contact4* hostContacts = new Contact4[nContacts];
+		data->m_contactBuffer->read(hostContacts,nContacts);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		gpuDebugInfo.read(debugInfo,numWorkItems);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		for (int i=0;i<numWorkItems;i++)
+		{
+			if (debugInfo[i].m_valInt1>0)
+			{
+				printf("catch\n");
+			}
+			if (debugInfo[i].m_valInt2>0)
+			{
+				printf("catch22\n");
+			}
+
+			if (debugInfo[i].m_valInt3>0)
+			{
+				printf("catch666\n");
+			}
+
+			if (debugInfo[i].m_valInt4>0)
+			{
+				printf("catch777\n");
+			}
+		}
+		delete[] hostContacts;
+		delete[] debugInfo;
+#endif //BATCH_DEBUG
+
+	}
+
+	//	Disabled dump of the per-cell counts and the batch index of every contact.
+	if(0)
+	{
+		u32* nhost = new u32[N_SPLIT*N_SPLIT];
+
+		nNative->read( nhost, N_SPLIT*N_SPLIT );
+
+		Contact4* chost = new Contact4[nContacts];
+		data->m_contactBuffer->read( chost, nContacts );
+		DeviceUtils::waitForCompletion( data->m_device );
+		printf(">>");
+		int nonzero = 0;
+		u32 maxn = 0;
+		for(int i=0; i<N_SPLIT*N_SPLIT; i++)
+		{
+			printf("%u-", nhost[i]);
+			nonzero += (nhost[i]==0)? 0:1;
+			maxn = max2( nhost[i], maxn );
+		}
+		printf("\nnonzero:zero = %d:%d (%u)\n", nonzero, N_SPLIT*N_SPLIT-nonzero, maxn);
+		printf("\n\n");
+
+		int prev = 0;
+		int prevIdx = 0;
+		int maxNBatches = 0;
+		for(int i=0; i<nContacts; i++)
+		{
+//			printf("(%d, %d:%d),", chost[i].m_batchIdx, chost[i].m_bodyAPtr, chost[i].m_bodyBPtr);
+			if( prev != 0 && chost[i].m_batchIdx == 0 )
+			{
+				maxNBatches = max2( maxNBatches, prev );
+				printf("\n[%d]", prev);
+
+				//for(int j=prevIdx; j<i; j++)
+				//{
+				//	printf("(%d:%d),", chost[j].m_bodyAPtr, chost[j].m_bodyBPtr);
+				//}
+
+				//printf("\n");
+
+				prevIdx = i;
+			}
+
+			printf("%d,", chost[i].m_batchIdx);
+
+			prev = chost[i].m_batchIdx;
+		}
+		printf("\n");
+		printf("Max: %d\n", maxNBatches);
+
+		delete [] chost;
+		delete [] nhost;
+	}
+//	copy buffer to buffer
+	contactNative->write( *data->m_contactBuffer, nContacts );
+	DeviceUtils::waitForCompletion( data->m_device );
+
+	//	Disabled sanity check: every body index must not exceed staticIdx.
+	if(0)
+	{
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			HostBuffer<Contact4> host( deviceHost, nContacts );
+			contactNative->read( host.m_ptr, nContacts );
+			DeviceUtils::waitForCompletion( data->m_device );
+
+			for(int i=0; i<nContacts; i++)
+			{
+				ADLASSERT( host[i].m_bodyAPtr <= (u32)staticIdx );
+				ADLASSERT( host[i].m_bodyBPtr <= (u32)staticIdx );
+			}
+		}
+		DeviceUtils::deallocate( deviceHost );
+	}
+
+	//	NOTE(review): the map above copies nContacts elements while this unmap passes no count -
+	//	confirm the full-size copy back is intended when `contacts` is not a native buffer.
+	BufferUtils::unmap<true>( contactNative, contacts );
+	BufferUtils::unmap<false>( nNative, n );
+	BufferUtils::unmap<false>( offsetsNative, offsets );
+}
+
+#undef PATH
+#undef KERNEL1
+#undef KERNEL2
+
+#undef KERNEL3
+#undef KERNEL4
+#undef KERNEL5
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverHost.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverHost.inl
@@ -0,0 +1,848 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+class SolverInl
+{
+public:
+	typedef SolverBase::ConstraintData ConstraintData;
+
+
+	//	Builds the Jacobian rows of a constraint acting along direction n.
+	//	r0 / r1 are the lever arms passed for the first / second body.
+	//	Outputs: linear = -n, angular0 = -(r0 x n), angular1 = r1 x n.
+	static
+	__forceinline
+	void setLinearAndAngular(const MYF4& n, const MYF4& r0, const MYF4& r1,
+							 MYF4& linear, MYF4& angular0, MYF4& angular1)
+	{
+		linear = -n;
+
+		const MYF4 armCrossA = cross3(r0, n);
+		angular0 = -armCrossA;
+
+		const MYF4 armCrossB = cross3(r1, n);
+		angular1 = armCrossB;
+	}
+
+	//	Returns -1 / (invMass0 + a0.(I0^-1 a0) + invMass1 + a1.(I1^-1 a1)).
+	//	linear0 / linear1 are not read: the linear rows are assumed to be normalized,
+	//	so their contribution is just the inverse mass of each body.
+	static
+	__forceinline
+	float calcJacCoeff(const MYF4& linear0, const MYF4& linear1, const MYF4& angular0, const MYF4& angular1,
+					  float invMass0, const Matrix3x3& invInertia0, float invMass1, const Matrix3x3& invInertia1)
+	{
+		const float angularTerm0 = dot3F4(mtMul3(angular0,invInertia0), angular0);
+		const float angularTerm1 = dot3F4(mtMul3(angular1,invInertia1), angular1);
+
+		//	same left-to-right summation order as the individual terms are listed above
+		return -1.f/(invMass0+angularTerm0+invMass1+angularTerm1);
+	}
+	//	Projects the velocities of both bodies onto a Jacobian row:
+	//	l0.linVel0 + a0.angVel0 + l1.linVel1 + a1.angVel1
+	static
+	__forceinline
+	float calcRelVel(const MYF4& l0, const MYF4& l1, const MYF4& a0, const MYF4& a1, 
+					 const MYF4& linVel0, const MYF4& angVel0, const MYF4& linVel1, const MYF4& angVel1)
+	{
+		const float linTerm0 = dot3F4(l0, linVel0);
+		const float angTerm0 = dot3F4(a0, angVel0);
+		const float linTerm1 = dot3F4(l1, linVel1);
+		const float angTerm1 = dot3F4(a1, angVel1);
+		return linTerm0 + angTerm0 + linTerm1 + angTerm1;
+	}
+
+	//	Converts one contact manifold (up to four points) into a solver constraint.
+	//	posX/linVelX/angVelX/invMassX/invInertiaX describe body A and body B,
+	//	src is the contact, cfg supplies the time step and position correction terms,
+	//	dstC receives the result.
+	//	Slots beyond src.getNPoints() get a zero Jacobian coefficient and a zero position.
+	//	Friction rows are only prepared when the manifold has more than one point;
+	//	otherwise m_fJacCoeffInv stays {0,0}.
+	static
+	__forceinline
+	void setConstraint4( const MYF4& posA, const MYF4& linVelA, const MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA, 
+		const MYF4& posB, const MYF4& linVelB, const MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		const Contact4& src, const SolverBase::ConstraintCfg& cfg, 
+		Constraint4& dstC )
+	{
+		const float invDt = 1.f/cfg.m_dt;
+		const MYF4& normal = src.m_worldNormal;
+
+		dstC.m_bodyA = (u32)src.m_bodyAPtr;
+		dstC.m_bodyB = (u32)src.m_bodyBPtr;
+		dstC.m_linear = -normal;
+		dstC.setFrictionCoeff( src.getFrictionCoeff() );
+		dstC.m_fJacCoeffInv[1] = 0.f;
+		dstC.m_fJacCoeffInv[0] = 0.f;
+
+		//	normal rows, one per contact point
+		for(int ip=0; ip<4; ip++)
+		{
+			dstC.m_appliedRambdaDt[ip] = 0.f;
+
+			if( ip >= src.getNPoints() )
+			{
+				//	unused slot
+				dstC.m_jacCoeffInv[ip] = 0.f;
+				dstC.m_worldPos[ip] = MAKE_MYF4(0.f);
+				continue;
+			}
+
+			const MYF4 armA = src.m_worldPos[ip] - posA;
+			const MYF4 armB = src.m_worldPos[ip] - posB;
+
+			MYF4 linear, angular0, angular1;
+			setLinearAndAngular(normal, armA, armB, linear, angular0, angular1);
+
+			dstC.m_jacCoeffInv[ip] = calcJacCoeff(linear, -linear, angular0, angular1,
+				invMassA, invInertiaA, invMassB, invInertiaB );
+
+			const float relVelN = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB);
+
+			//	restitution is dropped for slow approaches
+			float restitution = src.getRestituitionCoeff();
+			if( relVelN*relVelN < 0.004f ) restitution = 0.f;
+
+			//	bias = bounce term + position correction term
+			dstC.m_b[ip] = restitution*relVelN;
+			dstC.m_b[ip] += (src.getPenetration(ip) + cfg.m_positionDrift)*cfg.m_positionConstraintCoeff*invDt;
+
+			dstC.m_worldPos[ip] = src.m_worldPos[ip];
+		}
+
+		//	friction rows, applied at the centroid of the manifold
+		if( src.getNPoints() > 1 )
+		{
+			MYF4 centroid = MAKE_MYF4(0.f);
+			for(int ip=0; ip<src.getNPoints(); ip++) centroid += src.m_worldPos[ip];
+			centroid /= (float)src.getNPoints();
+
+			//	tangent[1] is derived from the un-normalized tangent[0]; both are normalized afterwards
+			MYF4 tangent[2];
+			tangent[0] = cross3( src.m_worldNormal, src.m_worldPos[0]-centroid );
+			tangent[1] = cross3( tangent[0], src.m_worldNormal );
+			tangent[0] = normalize3( tangent[0] );
+			tangent[1] = normalize3( tangent[1] );
+
+			const MYF4 armA = centroid - posA;
+			const MYF4 armB = centroid - posB;
+
+			for(int axis=0; axis<2; axis++)
+			{
+				MYF4 linear, angular0, angular1;
+				setLinearAndAngular(tangent[axis], armA, armB, linear, angular0, angular1);
+
+				dstC.m_fJacCoeffInv[axis] = calcJacCoeff(linear, -linear, angular0, angular1,
+					invMassA, invInertiaA, invMassB, invInertiaB );
+				dstC.m_fAppliedRambdaDt[axis] = 0.f;
+			}
+			dstC.m_center = centroid;
+		}
+	}
+
+/*
+	struct Constraint4
+	{
+		float4 m_linear;			X
+		float4 m_angular0[4];		X
+		float4 m_angular1[4];		center
+		float m_jacCoeffInv[4];		[0,1]
+		float m_b[4];				X
+		float m_appliedRambdaDt[4];	[0,1]
+
+		void* m_bodyAPtr;			X
+		void* m_bodyBPtr;			X
+	};
+*/
+	//	Applies the two friction rows of a constraint to the velocities of body A and body B.
+	//	cs           : constraint; m_fAppliedRambdaDt is updated with the clamped accumulated impulse
+	//	posX         : body positions, used to rebuild the lever arms to cs.m_center
+	//	linVelX/angVelX : velocities, modified in place
+	//	maxRambdaDt/minRambdaDt : clamp range for the accumulated impulse; only entries [0] and [1] are read
+	//	After the two rows, the angular velocity about the contact normal is damped by 10% when
+	//	the contact center is nearly in line with the two body centers or one body has zero inverse mass.
+	static
+	__inline
+	void solveFriction(Constraint4& cs, 
+		const MYF4& posA, MYF4& linVelA, MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA,
+		const MYF4& posB, MYF4& linVelB, MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		//	Nothing to do when neither friction row was prepared
+		//	(setConstraint4 leaves both coefficients at zero for single point manifolds).
+		//	The second test used to read index 0 again, so row 1 was never checked.
+		if( cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[1] == 0 ) return;
+		const MYF4& center = cs.m_center;
+
+		MYF4 n = -cs.m_linear;
+
+		//	Rebuild the same tangent frame that setConstraint4 used.
+		MYF4 tangent[2];
+		tangent[0] = cross3( n, cs.m_worldPos[0]-center );
+		tangent[1] = cross3( tangent[0], n );
+		tangent[0] = normalize3( tangent[0] );
+		tangent[1] = normalize3( tangent[1] );
+
+		MYF4 angular0, angular1, linear;
+		MYF4 r0 = center - posA;
+		MYF4 r1 = center - posB;
+		for(int i=0; i<2; i++)
+		{
+			setLinearAndAngular( tangent[i], r0, r1, linear, angular0, angular1 );
+			float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB );
+			rambdaDt *= cs.m_fJacCoeffInv[i];
+
+			{
+				//	Clamp the accumulated impulse, then apply only the difference.
+				float prevSum = cs.m_fAppliedRambdaDt[i];
+				float updated = prevSum;
+				updated += rambdaDt;
+				updated = max2( updated, minRambdaDt[i] );
+				updated = min2( updated, maxRambdaDt[i] );
+				rambdaDt = updated - prevSum;
+				cs.m_fAppliedRambdaDt[i] = updated;
+			}
+
+			MYF4 linImp0 = invMassA*linear*rambdaDt;
+			MYF4 linImp1 = invMassB*(-linear)*rambdaDt;
+			MYF4 angImp0 = mtMul1(invInertiaA, angular0)*rambdaDt;
+			MYF4 angImp1 = mtMul1(invInertiaB, angular1)*rambdaDt;
+
+			linVelA += linImp0;
+			angVelA += angImp0;
+			linVelB += linImp1;
+			angVelB += angImp1;
+		}
+
+		{	//	angular damping for point constraint
+			MYF4 ab = normalize3( posB - posA );
+			MYF4 ac = normalize3( center - posA );
+			if( dot3F4( ab, ac ) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
+			{
+				float angNA = dot3F4( n, angVelA );
+				float angNB = dot3F4( n, angVelB );
+
+				angVelA -= (angNA*0.1f)*n;
+				angVelB -= (angNB*0.1f)*n;
+			}
+		}
+	}
+
+	//	Solves the normal (non-penetration) rows of one constraint.
+	//	For every contact slot with a non-zero Jacobian coefficient an impulse is computed from the
+	//	current relative velocity plus the bias m_b, the accumulated impulse in m_appliedRambdaDt is
+	//	clamped to [minRambdaDt, maxRambdaDt], and the difference is applied.
+	//	JACOBI == true  : velocity changes are summed and added once after all slots are processed.
+	//	JACOBI == false : velocities are updated immediately, so later slots see earlier changes.
+	template<bool JACOBI>
+	static
+	__inline
+	void solveContact(Constraint4& cs, 
+		const MYF4& posA, MYF4& linVelA, MYF4& angVelA, float invMassA, const Matrix3x3& invInertiaA,
+		const MYF4& posB, MYF4& linVelB, MYF4& angVelB, float invMassB, const Matrix3x3& invInertiaB, 
+		float maxRambdaDt[4], float minRambdaDt[4])
+	{
+		//	only used in Jacobi mode
+		MYF4 pendingLinA = MAKE_MYF4(0.f);
+		MYF4 pendingAngA = MAKE_MYF4(0.f);
+		MYF4 pendingLinB = MAKE_MYF4(0.f);
+		MYF4 pendingAngB = MAKE_MYF4(0.f);
+
+		//	where the per slot velocity changes go
+		MYF4& outLinA = JACOBI ? pendingLinA : linVelA;
+		MYF4& outAngA = JACOBI ? pendingAngA : angVelA;
+		MYF4& outLinB = JACOBI ? pendingLinB : linVelB;
+		MYF4& outAngB = JACOBI ? pendingAngB : angVelB;
+
+		for(int slot=0; slot<4; slot++)
+		{
+			//	unused slots carry a zero coefficient
+			if( cs.m_jacCoeffInv[slot] == 0.f ) continue;
+
+			const MYF4 armA = cs.m_worldPos[slot] - posA;
+			const MYF4 armB = cs.m_worldPos[slot] - posB;
+
+			MYF4 linear, angular0, angular1;
+			setLinearAndAngular( -cs.m_linear, armA, armB, linear, angular0, angular1 );
+
+			float impulse = calcRelVel(cs.m_linear, -cs.m_linear, angular0, angular1,
+				linVelA, angVelA, linVelB, angVelB ) + cs.m_b[slot];
+			impulse *= cs.m_jacCoeffInv[slot];
+
+			//	clamp the running total and keep only the part that still fits
+			const float totalBefore = cs.m_appliedRambdaDt[slot];
+			float totalAfter = totalBefore;
+			totalAfter += impulse;
+			totalAfter = max2( totalAfter, minRambdaDt[slot] );
+			totalAfter = min2( totalAfter, maxRambdaDt[slot] );
+			impulse = totalAfter - totalBefore;
+			cs.m_appliedRambdaDt[slot] = totalAfter;
+
+			MYF4 linImp0 = invMassA*linear*impulse;
+			MYF4 linImp1 = invMassB*(-linear)*impulse;
+			MYF4 angImp0 = mtMul1(invInertiaA, angular0)*impulse;
+			MYF4 angImp1 = mtMul1(invInertiaB, angular1)*impulse;
+
+			outLinA += linImp0;
+			outAngA += angImp0;
+			outLinB += linImp1;
+			outAngB += angImp1;
+		}
+
+		if( JACOBI )
+		{
+			linVelA += pendingLinA;
+			angVelA += pendingAngA;
+			linVelB += pendingLinB;
+			angVelB += pendingAngB;
+		}
+	}
+
+	//	Local alias of SolverBase::N_SPLIT; used below to size the N_SPLIT*N_SPLIT per cell arrays.
+	enum
+	{
+		N_SPLIT = SolverBase::N_SPLIT,
+	};
+
+	//	for parallel solve
+	//	One entry per cell of the N_SPLIT x N_SPLIT grid.
+	struct ParallelSolveData
+	{
+		//	number of contacts in each cell (filled by sortContacts2 through BoundSearch)
+		u32 m_n[N_SPLIT*N_SPLIT];
+		//	index of the first contact of each cell (filled by sortContacts2 through PrefixScan over m_n)
+		u32 m_offset[N_SPLIT*N_SPLIT];
+	};
+
+	//	Assigns a batch index to each of the n contacts in cs and reorders cs by that index.
+	//	Within a batch no two contacts may share a body, except the body whose index is ignoreIdx.
+	//	Bodies are tracked in a 256 bit table indexed by the low 8 bits of the body index, so two
+	//	different bodies with equal low bits are treated as the same body.
+	//	simdWidth : when positive, the table is cleared after every simdWidth accepted contacts, so
+	//	            conflicts are only avoided inside each such group. The default -1 never triggers.
+	//	Returns the number of batches created.
+	static
+	__inline
+	int sortConstraintByBatch(Contact4* cs, int n, int ignoreIdx, int simdWidth = -1)
+	{
+		SortData* sortData;
+		{
+			BT_PROFILE("new");
+			sortData = new SortData[n];
+		}
+
+		//	idxSrc and idxDst alias the same storage: rejected indices are compacted in place,
+		//	which is safe because the write position never passes the read position.
+		u32* idxBuffer = new u32[n];
+		u32* idxSrc = idxBuffer;
+		u32* idxDst = idxBuffer;
+		int nIdxSrc, nIdxDst;
+
+		const int N_FLG = 256;
+		const int FLG_MASK = N_FLG-1;
+		u32 flg[N_FLG/32];
+#if defined(_DEBUG)
+		//	mark everything as unassigned so the check at the end can catch misses
+		for(int i=0; i<n; i++) cs[i].getBatchIdx() = -1; 
+#endif
+		for(int i=0; i<n; i++) idxSrc[i] = i;
+		nIdxSrc = n;
+
+		int batchIdx = 0;
+
+		{
+			BT_PROFILE("batching");
+			//	one pass per batch; contacts that do not fit are carried over to the next pass
+			while( nIdxSrc )
+			{
+				nIdxDst = 0;
+				int nCurrentBatch = 0;
+
+				//	clear flag
+				for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+
+				for(int i=0; i<nIdxSrc; i++)
+				{
+					int idx = idxSrc[i];
+					ADLASSERT( idx < n );
+					//	check if it can go
+					int aIdx = cs[idx].m_bodyAPtr & FLG_MASK;
+					int bIdx = cs[idx].m_bodyBPtr & FLG_MASK;
+
+					u32 aUnavailable = flg[ aIdx/32 ] & (1<<(aIdx&31));
+					u32 bUnavailable = flg[ bIdx/32 ] & (1<<(bIdx&31));
+
+					//	the ignored body never blocks a contact
+					aUnavailable = (ignoreIdx==cs[idx].m_bodyAPtr)? 0:aUnavailable;
+					bUnavailable = (ignoreIdx==cs[idx].m_bodyBPtr)? 0:bUnavailable;
+
+					if( aUnavailable==0 && bUnavailable==0 ) // ok 
+					{
+						//	NOTE(review): the bit of the ignored body is set here as well, which can
+						//	block other bodies that share its low 8 bits - confirm this is intended.
+						flg[ aIdx/32 ] |= (1<<(aIdx&31));
+						flg[ bIdx/32 ] |= (1<<(bIdx&31));
+						cs[idx].getBatchIdx() = batchIdx;
+						sortData[idx].m_key = batchIdx;
+						sortData[idx].m_value = idx;
+
+						{
+							nCurrentBatch++;
+							if( nCurrentBatch == simdWidth )
+							{
+								nCurrentBatch = 0;
+								for(int i=0; i<N_FLG/32; i++) flg[i] = 0;
+							}
+						}
+					}
+					else
+					{
+						//	keep it for the next batch
+						idxDst[nIdxDst++] = idx;
+					}
+				}
+				swap2( idxSrc, idxDst );
+				swap2( nIdxSrc, nIdxDst );
+				batchIdx ++;
+			}
+		}
+
+		
+
+		{
+			BT_PROFILE("radix sort data");
+			//	sort SortData
+			//	(presumably ordered by m_key, i.e. by batch index - RadixSort is defined elsewhere)
+			Device::Config cfg;
+			Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, cfg );
+			{
+				Buffer<SortData> sortBuffer; sortBuffer.setRawPtr( deviceHost, sortData, n );
+				RadixSort<TYPE_HOST>::Data* sort = RadixSort<TYPE_HOST>::allocate( deviceHost, n );
+
+				RadixSort<TYPE_HOST>::execute( sort, sortBuffer, n );
+
+				RadixSort<TYPE_HOST>::deallocate( sort );
+			}
+			DeviceUtils::deallocate( deviceHost );
+		}
+
+		{	
+				BT_PROFILE("reorder");
+			//	reorder
+			//	m_value holds the original position of the contact that belongs at slot i
+			Contact4* old = new Contact4[n];
+			memcpy( old, cs, sizeof(Contact4)*n);
+			for(int i=0; i<n; i++)
+			{
+				int idx = sortData[i].m_value;
+				cs[i] = old[idx];
+			}
+			delete [] old;
+		}
+
+		{
+			BT_PROFILE("delete");
+			delete [] idxBuffer;
+			delete [] sortData;
+		}
+#if defined(_DEBUG)
+//		debugPrintf( "nBatches: %d\n", batchIdx );
+		for(int i=0; i<n; i++) ADLASSERT( cs[i].getBatchIdx() != -1 );
+#endif
+		return batchIdx;
+	}
+};
+
+
+
+//	Placeholder: both enumerators are commented out, so this enum currently declares nothing.
+enum
+{
+//	N_SPLIT = SOLVER_N_SPLIT,
+//	MAX_TASKS_PER_BATCH = N_SPLIT*N_SPLIT/4,
+};
+
+//	Solves a contiguous range of constraints on the host.
+//	Depending on m_solveFriction, run() performs either the normal (contact) pass or the friction pass
+//	for constraints [m_start, m_start+m_nConstraints). The buffers are expected to be host buffers;
+//	body velocities and the accumulated impulses stored in the constraints are updated in place.
+struct SolveTask// : public ThreadPool::Task
+{
+	//	A freshly constructed task is set up for the friction pass.
+	SolveTask(const Buffer<RigidBodyBase::Body>* bodies, const Buffer<RigidBodyBase::Inertia>* shapes, const Buffer<Constraint4>* constraints,
+		int start, int nConstraints)
+		: m_bodies( bodies ), m_shapes( shapes ), m_constraints( constraints ), m_start( start ), m_nConstraints( nConstraints ),
+		m_solveFriction( true ){}
+
+	u16 getType(){ return 0; }
+
+	//	tIdx is not used.
+	void run(int tIdx)
+	{
+		HostBuffer<RigidBodyBase::Body>& bodies = *(HostBuffer<RigidBodyBase::Body>*)m_bodies;
+		HostBuffer<RigidBodyBase::Inertia>& inertias = *(HostBuffer<RigidBodyBase::Inertia>*)m_shapes;
+		HostBuffer<Constraint4>& constraints = *(HostBuffer<Constraint4>*)m_constraints;
+
+		const int end = m_start + m_nConstraints;
+		for(int i=m_start; i<end; i++)
+		{
+			//	read from the constraint, but the friction pass below replaces it with a fixed value
+			float frictionCoeff = constraints[i].getFrictionCoeff();
+
+			const int aIdx = (int)constraints[i].m_bodyA;
+			const int bIdx = (int)constraints[i].m_bodyB;
+			RigidBodyBase::Body& bodyA = bodies[aIdx];
+			RigidBodyBase::Body& bodyB = bodies[bIdx];
+
+			//	default range: impulses may only push the bodies apart
+			float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
+			float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
+
+			if( m_solveFriction )
+			{
+				//	friction is limited by the total normal impulse applied so far
+				float normalImpulse = 0;
+				for(int j=0; j<4; j++)
+				{
+					normalImpulse += constraints[i].m_appliedRambdaDt[j];
+				}
+
+				frictionCoeff = 0.7f;
+				const float limit = frictionCoeff*normalImpulse;
+				for(int j=0; j<4; j++)
+				{
+					maxRambdaDt[j] = limit;
+					minRambdaDt[j] = -maxRambdaDt[j];
+				}
+
+				SolverInl::solveFriction( constraints[i], bodyA.m_pos, (MYF4&)bodyA.m_linVel, (MYF4&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertia, 
+					bodyB.m_pos, (MYF4&)bodyB.m_linVel, (MYF4&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertia,
+					maxRambdaDt, minRambdaDt );
+			}
+			else
+			{
+				SolverInl::solveContact<false>( constraints[i], bodyA.m_pos, (MYF4&)bodyA.m_linVel, (MYF4&)bodyA.m_angVel, bodyA.m_invMass, inertias[aIdx].m_invInertia, 
+					bodyB.m_pos, (MYF4&)bodyB.m_linVel, (MYF4&)bodyB.m_angVel, bodyB.m_invMass, inertias[bIdx].m_invInertia,
+					maxRambdaDt, minRambdaDt );
+			}
+		}
+	}
+
+	const Buffer<RigidBodyBase::Body>* m_bodies;
+	const Buffer<RigidBodyBase::Inertia>* m_shapes;
+	const Buffer<Constraint4>* m_constraints;
+	int m_start;			//	first constraint to solve
+	int m_nConstraints;		//	number of constraints to solve
+	bool m_solveFriction;	//	true: friction pass, false: contact pass
+};
+
+
+//	Creates the solver state for the host device.
+//	device       : stored in the returned data, not owned
+//	pairCapacity : not used by the host implementation
+//	The result has to be released with Solver<TYPE_HOST>::deallocate.
+//	Declared inline instead of static: a storage class is not allowed on an explicit
+//	specialization, and inline keeps the definition usable from an included .inl file.
+template<>
+inline Solver<adl::TYPE_HOST>::Data* Solver<adl::TYPE_HOST>::allocate( const Device* device, int pairCapacity )
+{
+	Solver<adl::TYPE_HOST>::Data* data = new Data;
+	data->m_device = device;
+	data->m_parallelSolveData = 0;
+
+	return data;
+}
+
+//	Releases solver state created by Solver<TYPE_HOST>::allocate, including the
+//	ParallelSolveData that sortContacts2 may have attached to it.
+//	Declared inline instead of static: a storage class is not allowed on an explicit
+//	specialization, and inline keeps the definition usable from an included .inl file.
+template<>
+inline void Solver<adl::TYPE_HOST>::deallocate( Solver<TYPE_HOST>::Data* data )
+{
+	//	the cast restores the real type before deleting; deleting a null pointer is a no-op
+	delete (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+	delete data;
+}
+
+
+//	Sorts the first nContacts contacts of contactsIn by grid cell and records, per cell, how many
+//	contacts it holds and where they start (stored in a new ParallelSolveData attached to data).
+//	The cell of a contact is taken from the x/z position of body A, or of body B when body A is
+//	cfg.m_staticIdx. Nothing is sorted unless cfg.m_enableParallelSolve is set.
+//	additionalData is not used.
+//	NOTE(review): unlike the neighbouring helpers this function is not declared static.
+void sortContacts2(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+	HostBuffer<RigidBodyBase::Body>* bodyNative 
+		= (HostBuffer<RigidBodyBase::Body>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	HostBuffer<Contact4>* contactNative 
+		= (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contactsIn);
+
+	if( cfg.m_enableParallelSolve )
+	{
+		//	NOTE(review): the assert is the only guard against overwriting an earlier allocation;
+		//	where m_parallelSolveData is released or reset between calls is not visible here.
+		ADLASSERT( data->m_parallelSolveData == 0 );
+		data->m_parallelSolveData = new SolverInl::ParallelSolveData;
+		SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		HostBuffer<SortData> sortData( data->m_device, nContacts );
+		{	//	2. set cell idx
+			float spacing = adl::SolverBase::N_OBJ_PER_SPLIT*cfg.m_averageExtent;
+			float xScale = 1.f/spacing;
+			for(int i=0; i<nContacts; i++)
+			{
+				//	use body B when body A is the static body
+				int idx = ((*contactNative)[i].m_bodyAPtr==cfg.m_staticIdx)? (*contactNative)[i].m_bodyBPtr:(*contactNative)[i].m_bodyAPtr;
+				float4& p = (*bodyNative)[idx].m_pos;
+				//	the mask wraps the cell coordinate into [0, N_SPLIT); this presumably relies on
+				//	N_SPLIT being a power of two
+				int xIdx = (int)((p.x-((p.x<0.f)?1.f:0.f))*xScale)&(adl::SolverBase::N_SPLIT-1);
+				int zIdx = (int)((p.z-((p.z<0.f)?1.f:0.f))*xScale)&(adl::SolverBase::N_SPLIT-1);
+				ADLASSERT( xIdx >= 0 && xIdx < adl::SolverBase::N_SPLIT );
+				ADLASSERT( zIdx >= 0 && zIdx < adl::SolverBase::N_SPLIT );
+				//	key = linear cell index, value = original contact index
+				sortData[i].m_key = (xIdx+zIdx*adl::SolverBase::N_SPLIT);
+				sortData[i].m_value = i;
+			}
+		}
+
+		{	//	3. sort by cell idx
+			RadixSort<TYPE_HOST>::Data* sData = RadixSort<TYPE_HOST>::allocate( data->m_device, nContacts );
+
+			RadixSort<TYPE_HOST>::execute( sData, sortData, nContacts );
+
+			RadixSort<TYPE_HOST>::deallocate( sData );
+		}
+
+		{	//	4. find entries
+			//	counts / offsets wrap the arrays inside solveData, so the results land there directly
+			HostBuffer<u32> counts; counts.setRawPtr( data->m_device, solveData->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+			HostBuffer<u32> offsets; offsets.setRawPtr( data->m_device, solveData->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+			{
+				BoundSearch<TYPE_HOST>::Data* sData = BoundSearch<TYPE_HOST>::allocate( data->m_device );
+				PrefixScan<TYPE_HOST>::Data* pData = PrefixScan<TYPE_HOST>::allocate( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+
+				BoundSearch<TYPE_HOST>::execute( sData, sortData, nContacts, counts, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT, BoundSearchBase::COUNT );
+
+				PrefixScan<TYPE_HOST>::execute( pData, counts, offsets, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				
+				BoundSearch<TYPE_HOST>::deallocate( sData );
+				PrefixScan<TYPE_HOST>::deallocate( pData );
+			}
+#if defined(_DEBUG)
+			{
+				//	recompute counts and exclusive offsets with plain loops and compare
+				HostBuffer<u32> n0( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				HostBuffer<u32> offset0( data->m_device, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					n0[i] = 0;
+					offset0[i] = 0;
+				}
+
+				for(int i=0; i<nContacts; i++)
+				{
+					int idx = sortData[i].m_key;
+					n0[idx]++;
+				}
+
+				//	scan
+				int sum = 0;
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					offset0[i] = sum;
+					sum += n0[i];
+				}
+
+				for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+				{
+					ADLASSERT( n0[i] == counts[i] );
+					ADLASSERT( offset0[i] == offsets[i] );
+				}
+			}
+#endif
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	m_value holds the original position of the contact that belongs at slot i
+			Contact4* old = new Contact4[nContacts];
+			memcpy( old, contactNative->m_ptr, sizeof(Contact4)*nContacts );
+			for(int i=0; i<nContacts; i++)
+			{
+				int srcIdx = sortData[i].m_value;
+				(*contactNative)[i] = old[srcIdx];
+			}
+			delete [] old;
+		}
+	}
+
+	//	NOTE(review): the bool template argument presumably selects whether data is copied back;
+	//	bodies are only read here, contacts are reordered.
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<true>( contactNative, contactsIn );
+}
+
+//	Full host side preparation of the contact constraints:
+//	 1. sortContacts2   - sort the contacts by grid cell and record per cell counts / offsets
+//	 2. batchContacts   - assign batch indices inside every cell
+//	 3. convertToConstraints - turn the batched contacts into Constraint4 entries in contactCOut
+//	Requires cfg.m_enableParallelSolve: step 2 reads the per cell data that sortContacts2 only
+//	creates in that mode.
+static void reorderConvertToConstraints2( Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	adl::Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	sortContacts2( data, bodyBuf, contactsIn, additionalData, nContacts, cfg );
+
+	{
+		SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+		//	stays null when the parallel solve is disabled; fail loudly instead of dereferencing it
+		ADLASSERT( solveData );
+
+		//	wrap the per cell arrays so they can be passed as buffers
+		Buffer<u32> n; n.setRawPtr( data->m_device, solveData->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+		Buffer<u32> offsets; offsets.setRawPtr( data->m_device, solveData->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+		Solver<TYPE_HOST>::batchContacts( data, contactsIn, nContacts, &n, &offsets, cfg.m_staticIdx );
+	}
+
+	Solver<TYPE_HOST>::convertToConstraints( data, bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, cfg );
+}
+
+//	Sequential host solve over the first n constraints:
+//	data->m_nIterations contact passes, followed by the same number of friction passes.
+//	The template parameter TYPE and additionalData are not used.
+template<DeviceType TYPE>
+static void solveContactConstraint(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	const Buffer<Constraint4>* constraintBuf = (const Buffer<Constraint4>*)constraint;
+
+	Buffer<RigidBodyBase::Body>* bodyNative
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	Buffer<RigidBodyBase::Inertia>* shapeNative
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, shapeBuf );
+	Buffer<Constraint4>* constraintNative
+		= BufferUtils::map<TYPE_HOST, true>( data->m_device, constraintBuf );
+
+	//	one task covering every constraint; only the pass selector changes between runs
+	SolveTask task( bodyNative, shapeNative, constraintNative, 0, n );
+	for(int pass=0; pass<2; pass++)
+	{
+		//	pass 0: contact rows, pass 1: friction rows
+		task.m_solveFriction = (pass == 1);
+		for(int iter=0; iter<data->m_nIterations; iter++)
+		{
+			task.run(0);
+		}
+	}
+
+	BufferUtils::unmap<true>( bodyNative, bodyBuf );
+	BufferUtils::unmap<false>( shapeNative, shapeBuf );
+	BufferUtils::unmap<false>( constraintNative, constraintBuf );
+}
+
+//	Disabled code: createSolveTasks is compiled out, and its former body is additionally commented
+//	out so that the function would only assert and return 0.
+//	The commented body walked the N_SPLIT x N_SPLIT grid in four interleaved groups of cells and
+//	created one SolveTask per non-empty cell.
+#if 0
+static
+int createSolveTasks( int batchIdx, Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, int n, ThreadPool::Task* tasksOut[], int taskCapacity )
+{
+/*
+	ADLASSERT( (N_SPLIT&1) == 0 );
+	ADLASSERT( batchIdx < N_BATCHES );
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+	ADLASSERT( data->m_parallelSolveData );
+
+	SolverInl::ParallelSolveData* solveData = (SolverInl::ParallelSolveData*)data->m_parallelSolveData;
+	data->m_batchIdx = 0;
+
+	const int nx = N_SPLIT/2;
+
+	int nTasksCreated = 0;
+
+//	for(int ii=0; ii<2; ii++)
+	for(batchIdx=0; batchIdx<4; batchIdx++)
+	{
+		int2 offset = make_int2( batchIdx&1, batchIdx>>1 );
+		for(int ix=0; ix<nx; ix++) for(int iy=0; iy<nx; iy++)
+		{
+			int xIdx = ix*2 + offset.x;
+			int yIdx = iy*2 + offset.y;
+			int cellIdx = xIdx+yIdx*N_SPLIT;
+
+			int n = solveData->m_n[cellIdx];
+			int start = solveData->m_offset[cellIdx];
+
+			if( n == 0 ) continue;
+
+			SolveTask* task = new SolveTask( bodyBuf, shapeBuf, (const Buffer<Constraint4>*)constraint, start, n );
+//			task->m_solveFriction = (ii==0)? false:true;
+			tasksOut[nTasksCreated++] = task;
+		}
+	}
+
+	return nTasksCreated;
+*/
+	ADLASSERT(0);
+	return 0;
+}
+#endif
+
+
+
+//	Builds one Constraint4 in contactCOut for each of the first nContacts contacts in contactsIn.
+//	Invalid contacts are skipped, so their output slot is left untouched.
+//	A body whose linear and angular velocity are both zero is passed on with zero inverse mass and
+//	zero inverse inertia. The batch index of the contact is copied to the constraint.
+//	additionalData is not used.
+static void convertToConstraints2(  Solver<TYPE_HOST>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const Solver<TYPE_HOST>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+
+	Buffer<Constraint4>* constraintBuf = (Buffer<Constraint4>*)contactCOut;
+
+	HostBuffer<RigidBodyBase::Body>* bodyNative 
+		= (HostBuffer<RigidBodyBase::Body>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, bodyBuf );
+	HostBuffer<RigidBodyBase::Inertia>* shapeNative 
+		= (HostBuffer<RigidBodyBase::Inertia>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, shapeBuf );
+	HostBuffer<Contact4>* contactNative 
+		= (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contactsIn );
+	HostBuffer<Constraint4>* constraintNative 
+		= (HostBuffer<Constraint4>*)BufferUtils::map<TYPE_HOST, false>( data->m_device, constraintBuf );
+
+	//	every iteration writes only its own output slot
+#if !defined(_DEBUG)
+#pragma omp parallel for
+#endif
+	for(int i=0; i<nContacts; i++)
+	{
+		Contact4& contact = (*contactNative)[i];
+		if( contact.isInvalid() ) continue;
+
+		const int aIdx = (int)contact.m_bodyAPtr;
+		const int bIdx = (int)contact.m_bodyBPtr;
+		const RigidBodyBase::Body& bodyA = (*bodyNative)[aIdx];
+		const RigidBodyBase::Body& bodyB = (*bodyNative)[bIdx];
+
+		MYF4 posA( bodyA.m_pos );
+		MYF4 linVelA( bodyA.m_linVel );
+		MYF4 angVelA( bodyA.m_angVel );
+		MYF4 posB( bodyB.m_pos );
+		MYF4 linVelB( bodyB.m_linVel );
+		MYF4 angVelB( bodyB.m_angVel );
+
+		//	a body at rest is treated as immovable for this constraint
+		const bool aAtRest = ( isZero( linVelA ) && isZero( angVelA ) );
+		const bool bAtRest = ( isZero( linVelB ) && isZero( angVelB ) );
+
+		Constraint4& dst = (*constraintNative)[i];
+		SolverInl::setConstraint4( posA, linVelA, angVelA, 
+			(aAtRest)? 0.f : bodyA.m_invMass, (aAtRest)? mtZero() : (*shapeNative)[aIdx].m_invInertia,
+			posB, linVelB, angVelB, 
+			(bAtRest)? 0.f : bodyB.m_invMass, (bAtRest)? mtZero() : (*shapeNative)[bIdx].m_invInertia, 
+			contact, cfg, 
+			dst );
+		dst.m_batchIdx = contact.getBatchIdx();
+	}
+
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<false>( shapeNative, shapeBuf );
+	BufferUtils::unmap<false>( contactNative, contactsIn );
+	BufferUtils::unmap<true>( constraintNative, constraintBuf );
+}
+
+
+
+
+
+//	Assigns batch indices to the contacts of every grid cell on the host.
+//	contacts  : contacts already sorted by cell; reordered inside each cell by batch index
+//	nContacts : number of contacts to map
+//	n         : per cell contact count   (N_SPLIT*N_SPLIT entries)
+//	offsets   : per cell start index     (N_SPLIT*N_SPLIT entries)
+//	staticIdx : body index that never causes a batching conflict
+//	Prints the largest number of batches needed by any cell.
+static void batchContacts2(  Solver<TYPE_HOST>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_HOST );
+
+	HostBuffer<Contact4>* contactNative =0;
+	HostBuffer<u32>* nNative =0;
+	HostBuffer<u32>* offsetsNative =0;
+
+	{
+		BT_PROFILE("BufferUtils::map");
+		contactNative  = (HostBuffer<Contact4>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, contacts, nContacts );
+	}
+	{
+		BT_PROFILE("BufferUtils::map2");
+		nNative = (HostBuffer<u32>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, n );
+		offsetsNative= (HostBuffer<u32>*)BufferUtils::map<TYPE_HOST, true>( data->m_device, offsets );
+	}
+
+	{
+		BT_PROFILE("sortConstraintByBatch");
+		int maxNumBatches = 0;
+
+		for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+		{
+			//	named so that it does not hide the buffer parameter n
+			const int nInCell = (*nNative)[i];
+			if( nInCell == 0 ) continue;
+
+			const int offset = (*offsetsNative)[i];
+			const int numBatches = SolverInl::sortConstraintByBatch( contactNative->m_ptr+offset, nInCell, staticIdx, -1 );
+			//	max2 is the helper used throughout this file; plain max is not portable
+			maxNumBatches = max2( numBatches, maxNumBatches );
+		}
+
+		printf("maxNumBatches = %d\n", maxNumBatches);
+	}
+
+	{
+		BT_PROFILE("BufferUtils::unmap");
+		BufferUtils::unmap<true>( contactNative, contacts, nContacts );
+	}
+	{
+		BT_PROFILE("BufferUtils::unmap2");
+		BufferUtils::unmap<false>( nNative, n );
+		BufferUtils::unmap<false>( offsetsNative, offsets );
+	}
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.cl
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/SolverKernels.h
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.cl
@@ -0,0 +1,338 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
+
+#ifdef cl_ext_atomic_counters_32
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+#else
+#define counter32_t volatile __global int*
+#endif
+
+
+typedef unsigned int u32;
+typedef unsigned short u16;
+typedef unsigned char u8;
+
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GET_NUM_GROUPS get_num_groups(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+#define AppendInc(x, out) out = atomic_inc(x)
+#define AtomAdd(x, value) atom_add(&(x), value)
+#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
+#define AtomXhg(x, value) atom_xchg ( &(x), value )
+
+
+#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
+
+#define make_float4 (float4)
+#define make_float2 (float2)
+#define make_uint4 (uint4)
+#define make_int4 (int4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+
+#define max2 max
+#define min2 min
+
+
+#define WG_SIZE 64
+
+
+
+//	Contact record as seen by the batching kernel. The kernel only reads m_bodyA / m_bodyB and
+//	writes m_batchIdx; the remaining fields are copied through unchanged.
+typedef struct 
+{
+	float4 m_worldPos[4];
+	float4 m_worldNormal;
+	u32 m_coeffs;
+	int m_batchIdx;	//	batch assigned by CreateBatches
+
+	u32 m_bodyA;	//	index of the first body
+	u32 m_bodyB;	//	index of the second body
+}Contact4;
+
+//	Kernel constants. CreateBatches only reads m_staticIdx; it takes the per group count and start
+//	from the gN / gStart arrays instead of m_n / m_start.
+typedef struct 
+{
+	int m_n;
+	int m_start;
+	int m_staticIdx;
+	int m_paddings[1];
+} ConstBuffer;
+
+//	Pending constraint held in the local ring buffer of CreateBatches.
+typedef struct 
+{
+	u32 m_a;	//	smaller of the two body indices
+	u32 m_b;	//	larger of the two body indices
+	u32 m_idx;	//	index of the constraint relative to the start of its group
+}Elem;
+
+#define STACK_SIZE (WG_SIZE*10)
+//#define STACK_SIZE (WG_SIZE)
+#define RING_SIZE 1024
+#define RING_SIZE_MASK (RING_SIZE-1)
+#define CHECK_SIZE (WG_SIZE)
+
+
+#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)
+#define RING_END ldsTmp
+
+//	Tests the bit that belongs to idx in a bit table of 32*CHECK_SIZE entries (idx wraps around).
+//	Returns a non-zero value when the bit is set.
+u32 readBuf(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	return buff[word] & (1<<bit);
+}
+
+//	Atomically sets the bit that belongs to idx in a bit table of 32*CHECK_SIZE entries
+//	(idx wraps around).
+void writeBuf(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+	atom_or( &buff[word], (1<<bit) );
+}
+
+//	Atomically sets the bit that belongs to idx (idx wraps around at 32*CHECK_SIZE) and reports
+//	whether this call was the one that set it: returns 1 if the bit was clear before, 0 otherwise.
+u32 tryWrite(__local u32* buff, int idx)
+{
+	const int slot = idx % (32*CHECK_SIZE);
+	const int word = slot/32;
+	const int bit = slot%32;
+
+	const u32 before = (u32)atom_or( &buff[word], (1<<bit) );
+	const u32 wasSet = (before >> bit)&1;
+	return wasSet == 0;
+}
+
+//	batching on the GPU
+//	Each work group processes one group of constraints: gN[group] constraints starting at
+//	gStart[group] in gConstraints. Constraints are copied to gConstraintsOut (same range) with
+//	m_batchIdx set so that constraints of one batch do not share a body, as far as the local bit
+//	tables can tell (body indices wrap at 32*CHECK_SIZE).
+//	Local data:
+//	  ldsRingElem / ldsRingEnd   : constraints loaded but not yet placed in a batch
+//	  ldsStackIdx / ldsStackEnd  : constraints accepted for the current batch, waiting to be written
+//	  ldsFixedBuffer             : bodies already used by the current batch
+//	  ldsCheckBuffer             : arbitration between work items competing in the same step
+//	  ldsGEnd                    : number of source constraints consumed so far
+//	  ldsDstEnd                  : next write position in gConstraintsOut
+//	NOTE(review): several shared values are read right after being written by another work item
+//	without a barrier in between; this looks like it assumes the WG_SIZE work items run in
+//	lockstep - confirm for the target hardware.
+//	NOTE(review): cb.m_staticIdx is read but the code that would exempt the static body is
+//	commented out below.
+__kernel void CreateBatches( __global Contact4* gConstraints, __global Contact4* gConstraintsOut,
+		__global u32* gN, __global u32* gStart, 
+		ConstBuffer cb )
+{
+	__local u32 ldsStackIdx[STACK_SIZE];
+	__local u32 ldsStackEnd;
+	__local Elem ldsRingElem[RING_SIZE];
+	__local u32 ldsRingEnd;
+	__local u32 ldsTmp;
+	__local u32 ldsCheckBuffer[CHECK_SIZE];
+	__local u32 ldsFixedBuffer[CHECK_SIZE];
+	__local u32 ldsGEnd;
+	__local u32 ldsDstEnd;
+
+	int wgIdx = GET_GROUP_IDX;
+	int lIdx = GET_LOCAL_IDX;
+	
+	const int m_n = gN[wgIdx];
+	const int m_start = gStart[wgIdx];
+	const int m_staticIdx = cb.m_staticIdx;
+		
+	if( lIdx == 0 )
+	{
+		ldsRingEnd = 0;
+		ldsGEnd = 0;
+		ldsStackEnd = 0;
+		ldsDstEnd = m_start;
+	}
+	
+	//	One iteration per batch; ie is the batch index. The loop is capped at 250 iterations, so
+	//	constraints still pending after that are not written.
+//	while(1)
+	for(int ie=0; ie<250; ie++)
+	{
+		//	start the batch with no body in use (one word per work item; CHECK_SIZE equals WG_SIZE)
+		ldsFixedBuffer[lIdx] = 0;
+
+		for(int giter=0; giter<4; giter++)
+		{
+			int ringCap = GET_RING_CAPACITY;
+		
+			//	1. fill ring
+			//	load more source constraints while more than WG_SIZE ring slots are free
+			if( ldsGEnd < m_n )
+			{
+				while( ringCap > WG_SIZE )
+				{
+					if( ldsGEnd >= m_n ) break;
+					if( lIdx < ringCap - WG_SIZE )
+					{
+						int srcIdx;
+						AtomInc1( ldsGEnd, srcIdx );
+						if( srcIdx < m_n )
+						{
+							int dstIdx;
+							AtomInc1( ldsRingEnd, dstIdx );
+							
+							//	store the body pair in sorted order
+							int a = gConstraints[m_start+srcIdx].m_bodyA;
+							int b = gConstraints[m_start+srcIdx].m_bodyB;
+							ldsRingElem[dstIdx].m_a = (a>b)? b:a;
+							ldsRingElem[dstIdx].m_b = (a>b)? a:b;
+							ldsRingElem[dstIdx].m_idx = srcIdx;
+						}
+					}
+					ringCap = GET_RING_CAPACITY;
+				}
+			}
+
+			GROUP_LDS_BARRIER;
+	
+			//	2. fill stack
+			//	rejected elements are written back to the front of the same ring (dst aliases
+			//	ldsRingElem); RING_END (ldsTmp) counts how many were kept
+			__local Elem* dst = ldsRingElem;
+			if( lIdx == 0 ) RING_END = 0;
+
+			int srcIdx=lIdx;
+			int end = ldsRingEnd;
+
+			{
+				//	walk the ring WG_SIZE elements at a time, one element per work item
+				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)
+				{
+					Elem e;
+					if(srcIdx<end) e = ldsRingElem[srcIdx];
+					bool done = (srcIdx<end)?false:true;
+
+					//	NOTE(review): the body indexes with lIdx rather than the loop variable i;
+					//	equivalent only because CHECK_SIZE equals WG_SIZE.
+					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;
+					
+					if( !done )
+					{
+						//	skip if either body is already part of the current batch
+						int aUsed = readBuf( ldsFixedBuffer, e.m_a);
+						int bUsed = readBuf( ldsFixedBuffer, e.m_b);
+
+						if( aUsed==0 && bUsed==0 )
+						{
+							int aAvailable;
+							int bAvailable;
+
+							//	compete with the other work items of this step for both bodies
+							aAvailable = tryWrite( ldsCheckBuffer, e.m_a );
+							bAvailable = tryWrite( ldsCheckBuffer, e.m_b );
+
+							//aAvailable = (m_staticIdx == e.m_a)? 1: aAvailable;
+							//bAvailable = (m_staticIdx == e.m_b)? 1: bAvailable;
+
+							bool success = (aAvailable && bAvailable);
+							if(success)
+							{
+								writeBuf( ldsFixedBuffer, e.m_a );
+								writeBuf( ldsFixedBuffer, e.m_b );
+							}
+							done = success;
+						}
+					}
+
+					//	put it aside
+					if(srcIdx<end)
+					{
+						if( done )
+						{
+							//	accepted: push on the stack, or fall back to the ring if it is full
+							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );
+							if( dstIdx < STACK_SIZE )
+								ldsStackIdx[dstIdx] = e.m_idx;
+							else{
+								done = false;
+								AtomAdd( ldsStackEnd, -1 );
+							}
+						}
+						if( !done )
+						{
+							int dstIdx; AtomInc1( RING_END, dstIdx );
+							dst[dstIdx] = e;
+						}
+					}
+
+					//	if filled, flush
+					//	the flushed constraints get the current batch index and the body table is
+					//	cleared, so the batch index ie can be reused by a second, independent group
+					if( ldsStackEnd == STACK_SIZE )
+					{
+						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)
+						{
+							int idx = m_start + ldsStackIdx[i];
+							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+							gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+						}
+						if( lIdx == 0 ) ldsStackEnd = 0;
+
+						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) 
+						ldsFixedBuffer[lIdx] = 0;
+					}
+				}
+			}
+
+			//	the ring now holds only the rejected elements
+			if( lIdx == 0 ) ldsRingEnd = RING_END;
+		}
+
+		GROUP_LDS_BARRIER;
+
+		//	write out what was accepted for this batch
+		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)
+		{
+			int idx = m_start + ldsStackIdx[i];
+			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+			gConstraintsOut[ dstIdx ].m_batchIdx = ie;
+		}
+
+		//	in case it couldn't consume any pair. Flush them
+		//	todo. Serial batch worth while?
+		//	each flushed constraint gets its own batch index, 100 + its ring position
+		if( ldsStackEnd == 0 )
+		{
+			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)
+			{
+				int idx = m_start + ldsRingElem[i].m_idx;
+				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );
+				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];
+				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;
+			}
+			GROUP_LDS_BARRIER;
+			if( lIdx == 0 ) ldsRingEnd = 0;
+		}
+
+		if( lIdx == 0 ) ldsStackEnd = 0;
+
+		GROUP_LDS_BARRIER;
+
+		//	termination
+		//	done once every source constraint was loaded and nothing is left in the ring
+		if( ldsGEnd == m_n && ldsRingEnd == 0 )
+			break;
+	}
+
+
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/batchingKernels.h
@@ -0,0 +1,371 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+static const char* batchingKernelsCL= \
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
+"\n"
+"#ifdef cl_ext_atomic_counters_32\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"#else\n"
+"#define counter32_t volatile __global int*\n"
+"#endif\n"
+"\n"
+"\n"
+"typedef unsigned int u32;\n"
+"typedef unsigned short u16;\n"
+"typedef unsigned char u8;\n"
+"\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GET_NUM_GROUPS get_num_groups(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"#define AppendInc(x, out) out = atomic_inc(x)\n"
+"#define AtomAdd(x, value) atom_add(&(x), value)\n"
+"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
+"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
+"\n"
+"\n"
+"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
+"\n"
+"#define make_float4 (float4)\n"
+"#define make_float2 (float2)\n"
+"#define make_uint4 (uint4)\n"
+"#define make_int4 (int4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"\n"
+"#define max2 max\n"
+"#define min2 min\n"
+"\n"
+"\n"
+"#define WG_SIZE 64\n"
+"\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	float4 m_worldPos[4];\n"
+"	float4 m_worldNormal;\n"
+"	u32 m_coeffs;\n"
+"	int m_batchIdx;\n"
+"\n"
+"	u32 m_bodyA;\n"
+"	u32 m_bodyB;\n"
+"}Contact4;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	int m_n;\n"
+"	int m_start;\n"
+"	int m_staticIdx;\n"
+"	int m_paddings[1];\n"
+"} ConstBuffer;\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	u32 m_a;\n"
+"	u32 m_b;\n"
+"	u32 m_idx;\n"
+"}Elem;\n"
+"\n"
+"#define STACK_SIZE (WG_SIZE*10)\n"
+"//#define STACK_SIZE (WG_SIZE)\n"
+"#define RING_SIZE 1024\n"
+"#define RING_SIZE_MASK (RING_SIZE-1)\n"
+"#define CHECK_SIZE (WG_SIZE)\n"
+"\n"
+"\n"
+"#define GET_RING_CAPACITY (RING_SIZE - ldsRingEnd)\n"
+"#define RING_END ldsTmp\n"
+"\n"
+"u32 readBuf(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	return buff[bufIdx] & (1<<bitIdx);\n"
+"}\n"
+"\n"
+"void writeBuf(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"//	buff[bufIdx] |= (1<<bitIdx);\n"
+"	atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"}\n"
+"\n"
+"u32 tryWrite(__local u32* buff, int idx)\n"
+"{\n"
+"	idx = idx % (32*CHECK_SIZE);\n"
+"	int bitIdx = idx%32;\n"
+"	int bufIdx = idx/32;\n"
+"	u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
+"	return ((ans >> bitIdx)&1) == 0;\n"
+"}\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	int m_valInt0;\n"
+"	int m_valInt1;\n"
+"	int m_valInt2;\n"
+"	int m_valInt3;\n"
+"\n"
+"	int m_valInt4;\n"
+"	int m_valInt5;\n"
+"	int m_valInt6;\n"
+"	int m_valInt7;\n"
+"\n"
+"	int m_valInt8;\n"
+"	int m_valInt9;\n"
+"	int m_valInt10;\n"
+"	int m_valInt11;\n"
+"	\n"
+"	int	m_valInt12;\n"
+"	int	m_valInt13;\n"
+"	int	m_valInt14;\n"
+"	int	m_valInt15;\n"
+"\n"
+"\n"
+"	float m_fval0;\n"
+"	float m_fval1;\n"
+"	float m_fval2;\n"
+"	float m_fval3;\n"
+"} SolverDebugInfo;\n"
+"\n"
+"//	batching on the GPU\n"
+"__kernel void CreateBatches( __global Contact4* gConstraints, __global Contact4* gConstraintsOut, //__global u32* gRes, \n"
+"		__global u32* gN, __global u32* gStart, \n"
+"//		__global SolverDebugInfo* debugInfo, \n"
+"		ConstBuffer cb )\n"
+"{\n"
+"	__local u32 ldsStackIdx[STACK_SIZE];\n"
+"	__local u32 ldsStackEnd;\n"
+"	__local Elem ldsRingElem[RING_SIZE];\n"
+"	__local u32 ldsRingEnd;\n"
+"	__local u32 ldsTmp;\n"
+"	__local u32 ldsCheckBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsFixedBuffer[CHECK_SIZE];\n"
+"	__local u32 ldsGEnd;\n"
+"	__local u32 ldsDstEnd;\n"
+"\n"
+"	int wgIdx = GET_GROUP_IDX;\n"
+"	int lIdx = GET_LOCAL_IDX;\n"
+"	\n"
+"	const int m_n = gN[wgIdx];\n"
+"	const int m_start = gStart[wgIdx];\n"
+"	const int m_staticIdx = cb.m_staticIdx;\n"
+"		\n"
+"	if( lIdx == 0 )\n"
+"	{\n"
+"		ldsRingEnd = 0;\n"
+"		ldsGEnd = 0;\n"
+"		ldsStackEnd = 0;\n"
+"		ldsDstEnd = m_start;\n"
+"	}\n"
+"	\n"
+"//	while(1)\n"
+"	for(int ie=0; ie<250; ie++)\n"
+"	{\n"
+"		ldsFixedBuffer[lIdx] = 0;\n"
+"\n"
+"		for(int giter=0; giter<4; giter++)\n"
+"		{\n"
+"			int ringCap = GET_RING_CAPACITY;\n"
+"		\n"
+"			//	1. fill ring\n"
+"			if( ldsGEnd < m_n )\n"
+"			{\n"
+"				while( ringCap > WG_SIZE )\n"
+"				{\n"
+"					if( ldsGEnd >= m_n ) break;\n"
+"					if( lIdx < ringCap - WG_SIZE )\n"
+"					{\n"
+"						int srcIdx;\n"
+"						AtomInc1( ldsGEnd, srcIdx );\n"
+"						if( srcIdx < m_n )\n"
+"						{\n"
+"							int dstIdx;\n"
+"							AtomInc1( ldsRingEnd, dstIdx );\n"
+"							\n"
+"							int a = gConstraints[m_start+srcIdx].m_bodyA;\n"
+"							int b = gConstraints[m_start+srcIdx].m_bodyB;\n"
+"							ldsRingElem[dstIdx].m_a = (a>b)? b:a;\n"
+"							ldsRingElem[dstIdx].m_b = (a>b)? a:b;\n"
+"							ldsRingElem[dstIdx].m_idx = srcIdx;\n"
+"						}\n"
+"					}\n"
+"					ringCap = GET_RING_CAPACITY;\n"
+"				}\n"
+"			}\n"
+"\n"
+"			GROUP_LDS_BARRIER;\n"
+"	\n"
+"			//	2. fill stack\n"
+"			__local Elem* dst = ldsRingElem;\n"
+"			if( lIdx == 0 ) RING_END = 0;\n"
+"\n"
+"			int srcIdx=lIdx;\n"
+"			int end = ldsRingEnd;\n"
+"\n"
+"			{\n"
+"				for(int ii=0; ii<end; ii+=WG_SIZE, srcIdx+=WG_SIZE)\n"
+"				{\n"
+"					Elem e;\n"
+"					if(srcIdx<end) e = ldsRingElem[srcIdx];\n"
+"					bool done = (srcIdx<end)?false:true;\n"
+"\n"
+"					for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) ldsCheckBuffer[lIdx] = 0;\n"
+"					\n"
+"					if( !done )\n"
+"					{\n"
+"						int aUsed = readBuf( ldsFixedBuffer, e.m_a);\n"
+"						int bUsed = readBuf( ldsFixedBuffer, e.m_b);\n"
+"\n"
+"						if( aUsed==0 && bUsed==0 )\n"
+"						{\n"
+"							int aAvailable;\n"
+"							int bAvailable;\n"
+"\n"
+"							aAvailable = tryWrite( ldsCheckBuffer, e.m_a );\n"
+"							bAvailable = tryWrite( ldsCheckBuffer, e.m_b );\n"
+"\n"
+"							//aAvailable = (m_staticIdx == e.m_a)? 1: aAvailable;\n"
+"							//bAvailable = (m_staticIdx == e.m_b)? 1: bAvailable;\n"
+"\n"
+"							bool success = (aAvailable && bAvailable);\n"
+"							if(success)\n"
+"							{\n"
+"								writeBuf( ldsFixedBuffer, e.m_a );\n"
+"								writeBuf( ldsFixedBuffer, e.m_b );\n"
+"							}\n"
+"							done = success;\n"
+"						}\n"
+"					}\n"
+"\n"
+"					//	put it aside\n"
+"					if(srcIdx<end)\n"
+"					{\n"
+"						if( done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( ldsStackEnd, dstIdx );\n"
+"							if( dstIdx < STACK_SIZE )\n"
+"								ldsStackIdx[dstIdx] = e.m_idx;\n"
+"							else{\n"
+"								done = false;\n"
+"								AtomAdd( ldsStackEnd, -1 );\n"
+"							}\n"
+"						}\n"
+"						if( !done )\n"
+"						{\n"
+"							int dstIdx; AtomInc1( RING_END, dstIdx );\n"
+"							dst[dstIdx] = e;\n"
+"						}\n"
+"					}\n"
+"\n"
+"					//	if filled, flush\n"
+"					if( ldsStackEnd == STACK_SIZE )\n"
+"					{\n"
+"						for(int i=lIdx; i<STACK_SIZE; i+=WG_SIZE)\n"
+"						{\n"
+"							int idx = m_start + ldsStackIdx[i];\n"
+"							int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"							gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"							gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"						}\n"
+"						if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"\n"
+"						//for(int i=lIdx; i<CHECK_SIZE; i+=WG_SIZE) \n"
+"						ldsFixedBuffer[lIdx] = 0;\n"
+"					}\n"
+"				}\n"
+"			}\n"
+"\n"
+"			if( lIdx == 0 ) ldsRingEnd = RING_END;\n"
+"		}\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		for(int i=lIdx; i<ldsStackEnd; i+=WG_SIZE)\n"
+"		{\n"
+"			int idx = m_start + ldsStackIdx[i];\n"
+"			int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"			gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"			gConstraintsOut[ dstIdx ].m_batchIdx = ie;\n"
+"		}\n"
+"\n"
+"		//	in case it couldn't consume any pair. Flush them\n"
+"		//	todo. Serial batch worth while?\n"
+"		if( ldsStackEnd == 0 )\n"
+"		{\n"
+"			for(int i=lIdx; i<ldsRingEnd; i+=WG_SIZE)\n"
+"			{\n"
+"				int idx = m_start + ldsRingElem[i].m_idx;\n"
+"				int dstIdx; AtomInc1( ldsDstEnd, dstIdx );\n"
+"				gConstraintsOut[ dstIdx ] = gConstraints[ idx ];\n"
+"				gConstraintsOut[ dstIdx ].m_batchIdx = 100+i;\n"
+"			}\n"
+"			GROUP_LDS_BARRIER;\n"
+"			if( lIdx == 0 ) ldsRingEnd = 0;\n"
+"		}\n"
+"\n"
+"		if( lIdx == 0 ) ldsStackEnd = 0;\n"
+"\n"
+"		GROUP_LDS_BARRIER;\n"
+"\n"
+"		//	termination\n"
+"		if( ldsGEnd == m_n && ldsRingEnd == 0 )\n"
+"			break;\n"
+"	}\n"
+"\n"
+"\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"\n"
+;
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringify.py
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringify.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+import sys
+import os
+import shutil
+
+arg = sys.argv[1]
+fh = open(arg)
+	
+print 'static const char* '+sys.argv[2]+'= \\'
+for line in fh.readlines():
+	a = line.strip('\n')
+	print '"'+a+'\\n"'
+print ';'
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernels.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernels.bat
@@ -0,0 +1,6 @@
+@rem Regenerates ChNarrowphaseKernels.h: stringify.py wraps ChNarrowphaseKernels.cl in a
+@rem C string literal named narrowphaseKernelsCL; its stdout is redirected into the header.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+
+
+@rem Remind the user that the generated header may need manual touch-ups, then wait for a key press.
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsAll.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsAll.bat
@@ -0,0 +1,10 @@
+@rem Regenerates all three kernel headers. Each line runs stringify.py on one .cl file with
+@rem the name of the C string variable to emit, and redirects stdout into the matching .h file.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+stringify.py SolverKernels.cl solverKernelsCL >SolverKernels.h
+stringify.py batchingKernels.cl batchingKernelsCL >batchingKernels.h
+
+
+
+
+@rem Remind the user that the generated headers may need manual touch-ups, then wait for a key press.
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsBatching.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsBatching.bat
@@ -0,0 +1,8 @@
+@rem Regenerates batchingKernels.h: stringify.py wraps batchingKernels.cl in a C string
+@rem literal named batchingKernelsCL; its stdout is redirected into the header.
+stringify.py batchingKernels.cl batchingKernelsCL >batchingKernels.h
+
+
+
+
+@rem Remind the user that the generated header may need manual touch-ups, then wait for a key press.
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsNarrowphase.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsNarrowphase.bat
@@ -0,0 +1,8 @@
+@rem Regenerates ChNarrowphaseKernels.h: stringify.py wraps ChNarrowphaseKernels.cl in a
+@rem C string literal named narrowphaseKernelsCL; its stdout is redirected into the header.
+stringify.py ChNarrowphaseKernels.cl narrowphaseKernelsCL >ChNarrowphaseKernels.h
+
+
+
+
+@rem Remind the user that the generated header may need manual touch-ups, then wait for a key press.
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsSolver.bat
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/stringifykernelsSolver.bat
@@ -0,0 +1,8 @@
+@rem Regenerates SolverKernels.h: stringify.py wraps SolverKernels.cl in a C string
+@rem literal named solverKernelsCL; its stdout is redirected into the header.
+stringify.py SolverKernels.cl solverKernelsCL >SolverKernels.h
+
+
+
+
+@rem Remind the user that the generated header may need manual touch-ups, then wait for a key press.
+@echo Warning:
+@echo You might still need to find/replace for \\n (due to macros) and replace #include statements by their content
+pause
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/main.cpp
@@ -0,0 +1,77 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "BasicDemo.h"
+#include "GlutStuff.h"
+#include "btBulletDynamicsCommon.h"
+#include "LinearMath/btHashMap.h"
+
+#ifdef CL_PLATFORM_AMD
+#include "../../opencl/basic_initialize/btOpenCLUtils.h"
+extern cl_context			g_cxMainContext;
+extern cl_command_queue	g_cqCommandQue;
+extern cl_device_id		g_clDevice;
+#endif
+
+
+	
+/// Program entry point. In CL_PLATFORM_AMD builds it first fills the global OpenCL
+/// context, device and command queue; it then initialises a BasicDemo and enters GLUT.
+int main(int argc,char** argv)
+{
+#ifdef CL_PLATFORM_AMD
+	int ciErrNum = 0;
+	printf("This program was compiled using the %s OpenCL SDK\n",btOpenCLUtils::getSdkVendorName());
+
+	// Only GPU devices are requested.
+	cl_device_type requestedType = CL_DEVICE_TYPE_GPU;
+
+	// Both handles are passed as null (presumably meaning "no OpenGL sharing" -- TODO confirm).
+	void* sharedGlContext = 0;
+	void* sharedGlDC = 0;
+	g_cxMainContext = btOpenCLUtils::createContextFromType(requestedType, &ciErrNum, sharedGlContext, sharedGlDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// NOTE(review): when no device is reported, g_clDevice and g_cqCommandQue are left
+	// untouched and the demo still starts -- confirm that is acceptable.
+	if (btOpenCLUtils::getNumDevices(g_cxMainContext) > 0)
+	{
+		int firstDevice = 0;
+		g_clDevice = btOpenCLUtils::getDevice(g_cxMainContext, firstDevice);
+
+		btOpenCLDeviceInfo deviceInfo;
+		btOpenCLUtils::getDeviceInfo(g_clDevice, deviceInfo);
+		btOpenCLUtils::printDeviceInfo(g_clDevice);
+
+		// Command queue on the selected device, created with properties = 0.
+		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, g_clDevice, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+#endif //#ifdef CL_PLATFORM_AMD
+
+	BasicDemo ccdDemo;
+	ccdDemo.initPhysics();
+
+#ifdef CHECK_MEMORY_LEAKS
+	// Leak-check builds tear the physics down immediately instead of opening a window.
+	ccdDemo.exitPhysics();
+#else
+	glutmain(argc, argv, 1024, 600, "Bullet Physics Demo. http://bulletphysics.org", &ccdDemo);
+#endif
+
+	//setupGUI(1024,768);
+
+	// NOTE(review): this also runs in CHECK_MEMORY_LEAKS builds, where glutmain() was
+	// never called -- confirm that entering the GLUT loop there is intended.
+	glutMainLoop();
+
+	//default glut doesn't return from mainloop
+	return 0;
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/premake4.lua
@@ -0,0 +1,34 @@
+
+-- include "AMD"
+
+-- The basic Bullet 2 demo project is only generated when targeting Windows.
+if os.is("Windows") then
+
+	project "basic_bullet2_demo"
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../bin"
+
+	-- Header search paths, relative to this script.
+	includedirs {
+		".",
+		"../../bullet2",
+		"../testbed",
+		"../../rendering/Gwen",
+	}
+
+	-- Sibling projects this executable links against.
+	links {
+		"testbed",
+		"bullet2",
+		"gwen"
+	}
+
+	-- Helper functions defined outside this script.
+	initOpenGL()
+	initGlut()
+
+	-- Every source and header file below this directory.
+	files {
+		"**.cpp",
+		"**.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/DebugCastResult.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/DebugCastResult.h
@@ -0,0 +1,88 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef DEBUG_CAST_RESULT_H
+#define DEBUG_CAST_RESULT_H
+
+#include "BulletCollision/NarrowPhaseCollision/btConvexCast.h"
+#include "LinearMath/btTransform.h"
+#include "GL_ShapeDrawer.h"
+#include "GlutStuff.h"
+#ifdef WIN32
+#include <windows.h>
+#endif
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+#include <GL/gl.h>
+#endif
+/// Convex-cast result that can visualise itself with OpenGL: it remembers the start
+/// transform, the cast shape and its velocities, plus the drawer used to render the shape.
+struct btDebugCastResult : public btConvexCast::CastResult
+{
+	btTransform						m_fromTrans;
+	const btPolyhedralConvexShape*	m_shape;
+	btVector3						m_linVel;
+	btVector3						m_angVel;
+	GL_ShapeDrawer*					m_shapeDrawer;
+
+	/// Stores all arguments as-is; `drawer` may be null, in which case DebugDraw() draws nothing.
+	btDebugCastResult(const btTransform& fromTrans,
+					const btPolyhedralConvexShape* shape,
+					const btVector3& linVel,
+					const btVector3& angVel,
+					GL_ShapeDrawer* drawer)
+		: m_fromTrans(fromTrans)
+		, m_shape(shape)
+		, m_linVel(linVel)
+		, m_angVel(angVel)
+		, m_shapeDrawer(drawer)
+	{
+	}
+
+	/// Draws the three unit axes of `tr` as lines: x in red, y in green, z in blue.
+	virtual void drawCoordSystem(const btTransform& tr)
+	{
+		btScalar glMatrix[16];
+		tr.getOpenGLMatrix(glMatrix);
+
+		glPushMatrix();
+		btglLoadMatrix(glMatrix);
+		glBegin(GL_LINES);
+		for (int axis = 0; axis < 3; ++axis)
+		{
+			// The unit vector of the axis doubles as its colour and as the line end point.
+			btScalar unit[3] = { 0, 0, 0 };
+			unit[axis] = 1;
+			btglColor3(unit[0], unit[1], unit[2]);
+			btglVertex3(0, 0, 0);
+			btglVertex3(unit[0], unit[1], unit[2]);
+		}
+		glEnd();
+		glPopMatrix();
+	}
+
+	/// Integrates m_fromTrans with the stored velocities, passing `fraction` as the step,
+	/// and draws the shape at the resulting transform in red when a drawer is set.
+	virtual void	DebugDraw(btScalar	fraction)
+	{
+		const btVector3 boundsMin(-1000,-1000,-1000);
+		const btVector3 boundsMax(1000,1000,1000);
+
+		btTransform movedTrans;
+		btTransformUtil::integrateTransform(m_fromTrans, m_linVel, m_angVel, fraction, movedTrans);
+
+		btScalar glMatrix[16];
+		movedTrans.getOpenGLMatrix(glMatrix);
+
+		if (!m_shapeDrawer)
+			return;
+
+		m_shapeDrawer->drawOpenGL(glMatrix, m_shape, btVector3(1,0,0), btIDebugDraw::DBG_NoDebug, boundsMin, boundsMax);
+	}
+};
+
+
+#endif //DEBUG_CAST_RESULT_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/DemoApplication.h
@@ -0,0 +1,257 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef DEMO_APPLICATION_H
+#define DEMO_APPLICATION_H
+
+
+#include "GlutStuff.h"
+#include "GL_ShapeDrawer.h"
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btMatrix3x3.h"
+#include "LinearMath/btTransform.h"
+#include "LinearMath/btQuickprof.h"
+#include "LinearMath/btAlignedObjectArray.h"
+
+class	btCollisionShape;
+class	btDynamicsWorld;
+class	btRigidBody;
+class	btTypedConstraint;
+
+
+
+/// Base class of the GLUT demos in this testbed. It holds the camera, input and
+/// rendering state; concrete demos must implement initPhysics(),
+/// clientMoveAndDisplay(), swapBuffers() and updateModifierKeys().
+class DemoApplication
+{
+protected:
+	void	displayProfileString(int xOffset, int yStart, char* message);
+	class CProfileIterator*	m_profileIterator;
+
+#ifdef USE_BT_CLOCK
+	btClock	m_clock;
+#endif //USE_BT_CLOCK
+
+	///this is the most important class
+	btDynamicsWorld*	m_dynamicsWorld;
+
+	///constraint for mouse picking
+	btTypedConstraint*	m_pickConstraint;
+
+	virtual void	removePickingConstraint();
+
+	btCollisionShape*	m_shootBoxShape;
+
+	float	m_cameraDistance;
+	int		m_debugMode;
+
+	float	m_ele;
+	float	m_azi;
+	btVector3	m_cameraPosition;
+	btVector3	m_cameraTargetPosition;	//look at
+
+	int	m_mouseOldX;
+	int	m_mouseOldY;
+	int	m_mouseButtons;
+
+public:
+	int	m_modifierKeys;
+
+protected:
+	float	m_scaleBottom;
+	float	m_scaleFactor;
+	btVector3	m_cameraUp;
+	int		m_forwardAxis;
+
+	int	m_glutScreenWidth;
+	int	m_glutScreenHeight;
+
+	float	m_frustumZNear;
+	float	m_frustumZFar;
+
+	int	m_ortho;
+
+	float	m_ShootBoxInitialSpeed;
+
+	bool	m_stepping;
+	bool	m_singleStep;
+	bool	m_idle;
+	int		m_lastKey;
+
+	virtual float	showProfileInfo(int& xOffset, int& yStart, int yIncr);
+	void	renderscene(int pass);
+
+	GL_ShapeDrawer*	m_shapeDrawer;
+	bool			m_enableshadows;
+	btVector3		m_sundirection;
+	btScalar		m_defaultContactProcessingThreshold;
+
+public:
+	DemoApplication();
+
+	virtual ~DemoApplication();
+
+	/// Returns the dynamics world pointer held by this application.
+	btDynamicsWorld*	getDynamicsWorld() { return m_dynamicsWorld; }
+
+	/// Implemented by each demo.
+	virtual void	initPhysics() = 0;
+
+	/// Does nothing in the base class.
+	virtual void	setDrawClusters(bool drawClusters) {}
+
+	void	overrideGLShapeDrawer(GL_ShapeDrawer* shapeDrawer);
+
+	void	setOrthographicProjection();
+	void	resetPerspectiveProjection();
+
+	/// Forwards to the shape drawer and returns whatever enableTexture() returns.
+	bool	setTexturing(bool enable)
+	{
+		return m_shapeDrawer->enableTexture(enable);
+	}
+
+	/// Sets the shadow flag and returns the value it had before the call.
+	bool	setShadows(bool enable)
+	{
+		const bool previous = m_enableshadows;
+		m_enableshadows = enable;
+		return previous;
+	}
+
+	bool	getTexturing() const { return m_shapeDrawer->hasTextureEnabled(); }
+	bool	getShadows() const { return m_enableshadows; }
+
+	int		getDebugMode() { return m_debugMode; }
+
+	void	setDebugMode(int mode);
+
+	void	setAzi(float azi) { m_azi = azi; }
+
+	void	setCameraUp(const btVector3& camUp) { m_cameraUp = camUp; }
+	void	setCameraForwardAxis(int axis) { m_forwardAxis = axis; }
+
+	virtual void	myinit();
+
+	void	toggleIdle();
+
+	virtual void	updateCamera();
+
+	btVector3	getCameraPosition() { return m_cameraPosition; }
+	btVector3	getCameraTargetPosition() { return m_cameraTargetPosition; }
+
+	/// With USE_BT_CLOCK: returns the clock's microsecond reading and resets the clock.
+	/// Otherwise: always returns the fixed value 16666.
+	btScalar	getDeltaTimeMicroseconds()
+	{
+#ifdef USE_BT_CLOCK
+		const btScalar elapsed = (btScalar)m_clock.getTimeMicroseconds();
+		m_clock.reset();
+		return elapsed;
+#else
+		return btScalar(16666.);
+#endif
+	}
+
+	void	setFrustumZPlanes(float zNear, float zFar)
+	{
+		m_frustumZNear = zNear;
+		m_frustumZFar = zFar;
+	}
+
+	///glut callbacks
+
+	float	getCameraDistance();
+	void	setCameraDistance(float dist);
+	void	moveAndDisplay();
+
+	virtual void	clientMoveAndDisplay() = 0;
+
+	virtual void	clientResetScene();
+
+	///Demo functions
+	virtual void	setShootBoxShape();
+	virtual void	shootBox(const btVector3& destination);
+
+	btVector3	getRayTo(int x, int y);
+
+	btRigidBody*	localCreateRigidBody(float mass, const btTransform& startTransform, btCollisionShape* shape);
+
+	///callback methods by glut
+
+	virtual void	keyboardCallback(unsigned char key, int x, int y);
+
+	/// Empty default implementation.
+	virtual void	keyboardUpCallback(unsigned char key, int x, int y) {}
+
+	/// Empty default implementation.
+	virtual void	specialKeyboard(int key, int x, int y) {}
+
+	/// Empty default implementation.
+	virtual void	specialKeyboardUp(int key, int x, int y) {}
+
+	virtual void	reshape(int w, int h);
+
+	virtual void	mouseFunc(int button, int state, int x, int y);
+
+	virtual void	mouseMotionFunc(int x, int y);
+
+	virtual void	displayCallback();
+
+	virtual void	renderme();
+
+	virtual void	swapBuffers() = 0;
+
+	virtual void	updateModifierKeys() = 0;
+
+	void	stepLeft();
+	void	stepRight();
+	void	stepFront();
+	void	stepBack();
+	void	zoomIn();
+	void	zoomOut();
+
+	bool	isIdle() const { return m_idle; }
+
+	void	setIdle(bool idle) { m_idle = idle; }
+};
+
+#endif //DEMO_APPLICATION_H
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.cpp
@@ -0,0 +1,139 @@
+
+#include "GLDebugDrawer.h"
+#include "GLDebugFont.h"
+#include "GlutStuff.h"
+
+
+
+#include <stdio.h> //printf debugging
+/// Starts with debug mode 0.
+GLDebugDrawer::GLDebugDrawer()
+	: m_debugMode(0)
+{
+}
+
+/// Draws one GL line from `from` to `to`, with a separate colour set for each end point.
+void	GLDebugDrawer::drawLine(const btVector3& from,const btVector3& to,const btVector3& fromColor, const btVector3& toColor)
+{
+	glBegin(GL_LINES);
+
+	// Start point.
+	glColor3f(fromColor.getX(), fromColor.getY(), fromColor.getZ());
+	glVertex3d(from.getX(), from.getY(), from.getZ());
+
+	// End point.
+	glColor3f(toColor.getX(), toColor.getY(), toColor.getZ());
+	glVertex3d(to.getX(), to.getY(), to.getZ());
+
+	glEnd();
+}
+
+/// Single-colour line: forwards to the two-colour overload with `color` for both ends.
+void	GLDebugDrawer::drawLine(const btVector3& from,const btVector3& to,const btVector3& color)
+{
+	this->drawLine(from, to, color, color);
+}
+
+/// Draws a coarse sphere (5 latitude bands x 5 longitude segments) of the given radius
+/// centred at `p`, as one GL_QUAD_STRIP per latitude band.
+/// @param p       sphere centre; applied with glTranslatef inside a pushed matrix
+/// @param radius  sphere radius
+/// @param color   RGB colour; alpha is fixed to 1
+void GLDebugDrawer::drawSphere (const btVector3& p, btScalar radius, const btVector3& color)
+{
+	glColor4f (color.getX(), color.getY(), color.getZ(), btScalar(1.0f));
+	glPushMatrix ();
+	glTranslatef (p.getX(), p.getY(), p.getZ());
+
+	const int lats = 5;
+	const int longs = 5;
+
+	// Band i spans the latitudes [lat0, lat1]. Starting at i = 1 makes the first band begin
+	// exactly at the south pole (-pi/2); an i = 0 band would start below the pole and only
+	// produce a mirrored copy of the bottom cap on top of band 1.
+	for(int i = 1; i <= lats; i++) {
+		const btScalar lat0 = SIMD_PI * (-btScalar(0.5) + (btScalar) (i - 1) / lats);
+		const btScalar nz0 = sin(lat0);		// unit-sphere height of the lower ring
+		const btScalar nr0 = cos(lat0);		// unit-sphere radius of the lower ring
+		const btScalar z0  = radius*nz0;
+		const btScalar zr0 = radius*nr0;
+
+		const btScalar lat1 = SIMD_PI * (-btScalar(0.5) + (btScalar) i / lats);
+		const btScalar nz1 = sin(lat1);
+		const btScalar nr1 = cos(lat1);
+		const btScalar z1  = radius*nz1;
+		const btScalar zr1 = radius*nr1;
+
+		glBegin(GL_QUAD_STRIP);
+		// j runs one step past a full turn so the strip closes on itself.
+		for(int j = 0; j <= longs; j++) {
+			const btScalar lng = 2 * SIMD_PI * (btScalar) (j - 1) / longs;
+			const btScalar x = cos(lng);
+			const btScalar y = sin(lng);
+
+			// Normals are taken from the unit sphere so they stay unit length whatever the
+			// radius; GL does not renormalise them unless GL_NORMALIZE is enabled.
+			glNormal3f(x * nr0, y * nr0, nz0);
+			glVertex3f(x * zr0, y * zr0, z0);
+			glNormal3f(x * nr1, y * nr1, nz1);
+			glVertex3f(x * zr1, y * zr1, z1);
+		}
+		glEnd();
+	}
+
+	glPopMatrix();
+}
+
+/// Sets the current colour and builds the model transform for the axis-aligned box
+/// [boxMin, boxMax] inside a pushed matrix.
+/// NOTE(review): the glutSolidCube(1.0) call that would emit geometry is disabled, so
+/// nothing is drawn between the push and the pop -- confirm this is intended.
+void GLDebugDrawer::drawBox (const btVector3& boxMin, const btVector3& boxMax, const btVector3& color, btScalar alpha)
+{
+	btVector3 halfSize = (boxMax - boxMin) * btScalar(0.5f);
+	btVector3 middle = (boxMax + boxMin) * btScalar(0.5f);
+
+	// Blending is left off here (glEnable(GL_BLEND) / glBlendFunc(GL_SRC_ALPHA, GL_ONE) disabled).
+	glColor4f (color.getX(), color.getY(), color.getZ(), alpha);
+
+	glPushMatrix ();
+	glTranslatef (middle.getX(), middle.getY(), middle.getZ());
+	glScaled(2*halfSize[0], 2*halfSize[1], 2*halfSize[2]);
+	// glutSolidCube(1.0);
+	glPopMatrix ();
+}
+
+/// Draws the triangle a-b-c as one GL_TRIANGLES primitive in the given colour and alpha,
+/// with the normalised cross product (b-a) x (c-a) as its normal. It is drawn
+/// regardless of m_debugMode.
+void	GLDebugDrawer::drawTriangle(const btVector3& a,const btVector3& b,const btVector3& c,const btVector3& color,btScalar alpha)
+{
+	const btVector3 edgeAB = b - a;
+	const btVector3 edgeAC = c - a;
+	const btVector3 faceNormal = btCross(edgeAB, edgeAC).normalized();
+
+	glBegin(GL_TRIANGLES);
+	glColor4f(color.getX(), color.getY(), color.getZ(), alpha);
+	glNormal3d(faceNormal.getX(), faceNormal.getY(), faceNormal.getZ());
+	glVertex3d(a.getX(), a.getY(), a.getZ());
+	glVertex3d(b.getX(), b.getY(), b.getZ());
+	glVertex3d(c.getX(), c.getY(), c.getZ());
+	glEnd();
+}
+
+/// Stores the debug mode that getDebugMode() reports.
+void	GLDebugDrawer::setDebugMode(int debugMode)
+{
+	this->m_debugMode = debugMode;
+}
+
+/// Moves the GL raster position to `location`. The call that would render `textString`
+/// (BMF_DrawString) is disabled, so no text is drawn.
+void	GLDebugDrawer::draw3dText(const btVector3& location,const char* textString)
+{
+	const btScalar px = location.x();
+	const btScalar py = location.y();
+	const btScalar pz = location.z();
+	glRasterPos3f(px, py, pz);
+	//BMF_DrawString(BMF_GetFont(BMF_kHelvetica10),textString);
+}
+
+/// Prints the warning text to stdout, followed by a newline.
+void	GLDebugDrawer::reportErrorWarning(const char* warningString)
+{
+	printf("%s\n", warningString);
+}
+
+/// Draws a contact as a line from `pointOnB` along `normalOnB`. The line length is the
+/// length of `normalOnB` itself; `distance` and `lifeTime` are not used.
+void	GLDebugDrawer::drawContactPoint(const btVector3& pointOnB,const btVector3& normalOnB,btScalar distance,int lifeTime,const btVector3& color)
+{
+	// The normal is scaled by 1 rather than by `distance`.
+	const btVector3 lineEnd = pointOnB + normalOnB * btScalar(1);
+
+	glColor4f(color.getX(), color.getY(), color.getZ(), 1.f);
+
+	glBegin(GL_LINES);
+	glVertex3d(pointOnB.getX(), pointOnB.getY(), pointOnB.getZ());
+	glVertex3d(lineEnd.getX(), lineEnd.getY(), lineEnd.getZ());
+	glEnd();
+
+	// Printing lifeTime next to the point (glRasterPos3f + sprintf + BMF_DrawString) is disabled.
+}
+
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugDrawer.h
@@ -0,0 +1,38 @@
+#ifndef GL_DEBUG_DRAWER_H
+#define GL_DEBUG_DRAWER_H
+
+#include "LinearMath/btIDebugDraw.h"
+
+
+
+/// btIDebugDraw implementation declared for the OpenGL testbed; the member functions
+/// are defined outside this header, except getDebugMode().
+class GLDebugDrawer : public btIDebugDraw
+{
+	int m_debugMode;	// value reported by getDebugMode()
+
+public:
+	GLDebugDrawer();
+
+	/// Line with one colour per end point.
+	virtual void	drawLine(const btVector3& from, const btVector3& to, const btVector3& fromColor, const btVector3& toColor);
+
+	/// Line with a single colour.
+	virtual void	drawLine(const btVector3& from, const btVector3& to, const btVector3& color);
+
+	virtual void	drawSphere(const btVector3& p, btScalar radius, const btVector3& color);
+
+	virtual void	drawBox(const btVector3& boxMin, const btVector3& boxMax, const btVector3& color, btScalar alpha);
+
+	virtual void	drawTriangle(const btVector3& a, const btVector3& b, const btVector3& c, const btVector3& color, btScalar alpha);
+
+	virtual void	drawContactPoint(const btVector3& PointOnB, const btVector3& normalOnB, btScalar distance, int lifeTime, const btVector3& color);
+
+	virtual void	reportErrorWarning(const char* warningString);
+
+	virtual void	draw3dText(const btVector3& location, const char* textString);
+
+	virtual void	setDebugMode(int debugMode);
+
+	/// Returns the stored debug mode.
+	virtual int		getDebugMode() const
+	{
+		return m_debugMode;
+	}
+};
+
+#endif//GL_DEBUG_DRAWER_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GLDebugFont.h
@@ -0,0 +1,29 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef BT_DEBUG_FONT_H
+#define BT_DEBUG_FONT_H
+
+#include "LinearMath/btVector3.h"
+
+
+// Debug text helpers; only the declarations are visible in this header.
+// NOTE(review): x/y look like screen coordinates and rgb like a text colour -- confirm against the implementation.
+void	GLDebugDrawStringInternal(int x,int y,const char* string,const btVector3& rgb, bool enableBlend, int spacing);
+// Overload without the enableBlend and spacing arguments.
+void	GLDebugDrawStringInternal(int x,int y,const char* string,const btVector3& rgb);
+// Variant that takes neither a colour nor blending/spacing options.
+void	GLDebugDrawString(int x,int y,const char* string);
+// NOTE(review): presumably re-initialises the debug font for a new window size -- confirm against the implementation.
+void	GLDebugResetFont(int screenWidth,int screenHeight);
+
+#endif //BT_DEBUG_FONT_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.cpp
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_ShapeDrawer.h
@@ -0,0 +1,70 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef GL_SHAPE_DRAWER_H
+#define GL_SHAPE_DRAWER_H
+
+class btCollisionShape;
+class btShapeHull;
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btVector3.h"
+
+#include "BulletCollision/CollisionShapes/btShapeHull.h"
+
+/// GL_ShapeDrawer draws Bullet collision shapes, and their shadows, with OpenGL.
+/// It keeps a list of ShapeCache objects (hull + edge data) created for convex shapes.
+class GL_ShapeDrawer
+{
+protected:
+	/// Render cache for one convex shape: its btShapeHull plus a list of edges.
+	struct ShapeCache
+	{
+		/// One edge: two normals and two vertex indices (filled in by the .cpp).
+		struct Edge
+		{
+			btVector3	n[2];
+			int			v[2];
+		};
+
+		ShapeCache(btConvexShape* s)
+			: m_shapehull(s)
+		{
+		}
+
+		btShapeHull					m_shapehull;
+		btAlignedObjectArray<Edge>	m_edges;
+	};
+
+	/// Dynamically created shape hulls, remembered so that their memory can be cleaned up.
+	btAlignedObjectArray<ShapeCache*>	m_shapecaches;
+	unsigned int						m_texturehandle;
+	bool								m_textureenabled;
+	bool								m_textureinitialized;
+
+	/// Returns the ShapeCache belonging to the given convex shape (defined in the .cpp).
+	ShapeCache*							cache(btConvexShape*);
+
+public:
+	GL_ShapeDrawer();
+
+	virtual ~GL_ShapeDrawer();
+
+	///drawOpenGL might allocate temporary memory, stores pointer in shape userpointer
+	virtual void		drawOpenGL(btScalar* m, const btCollisionShape* shape, const btVector3& color,int	debugMode,const btVector3& worldBoundsMin,const btVector3& worldBoundsMax);
+	virtual void		drawShadow(btScalar* m, const btVector3& extrusion,const btCollisionShape* shape,const btVector3& worldBoundsMin,const btVector3& worldBoundsMax);
+
+	/// Switches texturing on or off and returns the setting that was active before the call.
+	bool		enableTexture(bool enable)
+	{
+		const bool previous = m_textureenabled;
+		m_textureenabled = enable;
+		return previous;
+	}
+
+	/// Reports whether texturing is currently switched on.
+	bool		hasTextureEnabled() const
+	{
+		return m_textureenabled;
+	}
+
+	static void		drawCylinder(float radius,float halfHeight, int upAxis);
+	void			drawSphere(btScalar r, int lats, int longs);
+	static void		drawCoordSystem();
+
+};
+
+void OGL_displaylist_register_shape(btCollisionShape * shape);	// declared here, defined elsewhere; presumably registers a display list for `shape` -- TODO confirm
+void OGL_displaylist_clean();	// presumably releases what the call above registered -- TODO confirm
+
+#endif //GL_SHAPE_DRAWER_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.cpp
@@ -0,0 +1,76 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#include "GL_Simplex1to4.h"
+#include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
+#include "GL_ShapeDrawer.h"
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+#include <GL/gl.h>
+#endif
+#include "GlutStuff.h"
+#include "LinearMath/btTransform.h"
+
+/// Creates the debug simplex with no solver attached (m_simplexSolver is null).
+GL_Simplex1to4::GL_Simplex1to4()
+	: m_simplexSolver(0)
+{
+}
+
+///
+/// Debugging method calcClosest calculates the closest point to the origin, using m_simplexSolver,
+/// and draws a red line from the origin to the resulting vector.
+///
+/// @param m  OpenGL matrix; it is loaded into a btTransform that is applied to every
+///           simplex vertex before the vertex is handed to the solver.
+///
+/// The coordinate system is always drawn. Without an attached solver nothing else happens.
+///
+void	GL_Simplex1to4::calcClosest(btScalar* m)
+{
+	btTransform tr;
+	tr.setFromOpenGLMatrix(m);
+
+	GL_ShapeDrawer::drawCoordSystem();
+
+	if (!m_simplexSolver)
+	{
+		return;
+	}
+
+	m_simplexSolver->reset();
+
+	// Start at the origin: with an empty simplex the loop below never runs and the
+	// line drawn afterwards must not read an uninitialized vector.
+	btVector3 v(0.f,0.f,0.f);
+
+	for (int i=0;i<m_numVertices;i++)
+	{
+		v = tr(m_vertices[i]);
+		m_simplexSolver->addVertex(v,v,btVector3(0.f,0.f,0.f));
+		// v is passed by reference (presumably receiving the closest point); the
+		// boolean result was never used by this debug view and is ignored.
+		m_simplexSolver->closest(v);
+	}
+
+	//draw v?
+	glDisable(GL_LIGHTING);
+	glBegin(GL_LINES);
+	btglColor3(1.f, 0.f, 0.f);
+	btglVertex3(0.f, 0.f, 0.f);
+	btglVertex3(v.x(),v.y(),v.z());
+	glEnd();
+
+	// lighting is switched back on unconditionally, as before
+	glEnable(GL_LIGHTING);
+}
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GL_Simplex1to4.h
@@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef GL_SIMPLEX_1TO4_H
+#define GL_SIMPLEX_1TO4_H
+
+#include "BulletCollision/CollisionShapes/btTetrahedronShape.h"
+
+#include "BulletCollision/NarrowPhaseCollision/btSimplexSolverInterface.h"
+
+/// GL_Simplex1to4 extends btBU_Simplex1to4 with an OpenGL debug view for a
+/// simplex solver working on 1 to 4 points. Can be used by GJK.
+class GL_Simplex1to4 : public btBU_Simplex1to4
+{
+private:
+	/// Solver used by calcClosest(); may be null. This class declares no destructor and never deletes it.
+	btSimplexSolverInterface*	m_simplexSolver;
+
+public:
+	/// Starts without a solver.
+	GL_Simplex1to4();
+
+	/// Runs the attached solver on the transformed vertices and draws the result.
+	void	calcClosest(btScalar* m);
+
+	/// Sets the solver used by calcClosest(); pass null to detach it.
+	void	setSimplexSolver(btSimplexSolverInterface* simplexSolver)
+	{
+		m_simplexSolver = simplexSolver;
+	}
+};
+
+#endif //GL_SIMPLEX_1TO4_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.cpp
@@ -0,0 +1,87 @@
+
+#ifndef _WINDOWS
+
+#include "GlutDemoApplication.h"
+
+#include "GlutStuff.h"
+
+#include "BulletDynamics/Dynamics/btDiscreteDynamicsWorld.h"
+#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+/// Rebuilds m_modifierKeys from the modifier state reported by glutGetModifiers().
+/// Each active GLUT modifier is translated to the matching BT_ACTIVE_* bit.
+void	GlutDemoApplication::updateModifierKeys()
+{
+	// Ask GLUT once; the state cannot change between the three tests below.
+	const int glutModifiers = glutGetModifiers();
+
+	m_modifierKeys = 0;
+	if (glutModifiers & GLUT_ACTIVE_ALT)
+		m_modifierKeys |= BT_ACTIVE_ALT;
+
+	if (glutModifiers & GLUT_ACTIVE_CTRL)
+		m_modifierKeys |= BT_ACTIVE_CTRL;
+
+	if (glutModifiers & GLUT_ACTIVE_SHIFT)
+		m_modifierKeys |= BT_ACTIVE_SHIFT;
+}
+
+/// Handles GLUT "special" (non-ASCII) key presses and then requests a redraw.
+///
+/// @param key  GLUT_KEY_* code of the pressed key
+/// @param x,y  mouse position supplied by GLUT; unused
+///
+/// END removes and deletes the last collision object of the dynamics world, the arrow
+/// keys call stepLeft/stepRight/stepFront/stepBack, page up/down zoom, HOME toggles idle.
+/// F1 and F2 are accepted but do nothing.
+void GlutDemoApplication::specialKeyboard(int key, int x, int y)	
+{
+	(void)x;
+	(void)y;
+
+	switch (key) 
+	{
+	case GLUT_KEY_F1:
+	case GLUT_KEY_F2:
+		{
+			// no action assigned
+			break;
+		}
+
+	case GLUT_KEY_END:
+		{
+			// A demo may run without a dynamics world (Win32AppMain checks for
+			// that too), in which case there is nothing to remove.
+			if (!getDynamicsWorld())
+			{
+				break;
+			}
+
+			const int numObj = getDynamicsWorld()->getNumCollisionObjects();
+			if (numObj > 0)
+			{
+				btCollisionObject* obj = getDynamicsWorld()->getCollisionObjectArray()[numObj-1];
+
+				// take the object out of the world first, then free what it owns
+				getDynamicsWorld()->removeCollisionObject(obj);
+				btRigidBody* body = btRigidBody::upcast(obj);
+				if (body && body->getMotionState())
+				{
+					delete body->getMotionState();
+				}
+				delete obj;
+			}
+			break;
+		}
+	case GLUT_KEY_LEFT : stepLeft(); break;
+	case GLUT_KEY_RIGHT : stepRight(); break;
+	case GLUT_KEY_UP : stepFront(); break;
+	case GLUT_KEY_DOWN : stepBack(); break;
+	case GLUT_KEY_PAGE_UP : zoomIn(); break;
+	case GLUT_KEY_PAGE_DOWN : zoomOut(); break;
+	case GLUT_KEY_HOME : toggleIdle(); break;
+	default:
+		//        std::cout << "unused (special) key : " << key << std::endl;
+		break;
+	}
+
+	glutPostRedisplay();
+
+}
+
+/// Presents the rendered frame by asking GLUT to swap the buffers.
+void GlutDemoApplication::swapBuffers()
+{
+	glutSwapBuffers();
+}
+
+#endif //_WINDOWS
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutDemoApplication.h
@@ -0,0 +1,34 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef GLUT_DEMO_APPLICATION_H
+#define GLUT_DEMO_APPLICATION_H
+
+#include "DemoApplication.h"
+
+class GlutDemoApplication : public DemoApplication	// GLUT flavour of DemoApplication (input and buffer swap go through GLUT)
+{
+public:
+	// Handles GLUT special keys (arrows, page up/down, home, end); see GlutDemoApplication.cpp.
+	void specialKeyboard(int key, int x, int y);
+	// Swaps the buffers via glutSwapBuffers().
+	virtual void swapBuffers();
+	// Refreshes m_modifierKeys from glutGetModifiers().
+	virtual	void	updateModifierKeys();
+
+};
+#endif //GLUT_DEMO_APPLICATION_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.cpp
@@ -0,0 +1,119 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _WINDOWS
+
+#include "DemoApplication.h"
+
+//glut is C code, this global gDemoApplication links glut to the C++ demo
+static DemoApplication* gDemoApplication = 0;	// assigned by glutmain(); the callbacks below dereference it without a null check
+
+
+#include "GlutStuff.h"
+
+static	void glutKeyboardCallback(unsigned char key, int x, int y)	// registered with glutKeyboardFunc; forwards to DemoApplication::keyboardCallback
+{
+	gDemoApplication->keyboardCallback(key,x,y);
+}
+
+static	void glutKeyboardUpCallback(unsigned char key, int x, int y)	// registered with glutKeyboardUpFunc; forwards to DemoApplication::keyboardUpCallback
+{
+  gDemoApplication->keyboardUpCallback(key,x,y);
+}
+
+static void glutSpecialKeyboardCallback(int key, int x, int y)	// registered with glutSpecialFunc; forwards to DemoApplication::specialKeyboard
+{
+	gDemoApplication->specialKeyboard(key,x,y);
+}
+
+static void glutSpecialKeyboardUpCallback(int key, int x, int y)	// registered with glutSpecialUpFunc; forwards to DemoApplication::specialKeyboardUp
+{
+	gDemoApplication->specialKeyboardUp(key,x,y);
+}
+
+
+static void glutReshapeCallback(int w, int h)	// registered with glutReshapeFunc; forwards the new window size to DemoApplication::reshape
+{
+	gDemoApplication->reshape(w,h);
+}
+
+static void glutMoveAndDisplayCallback()	// registered as the GLUT idle function and also called once directly by glutmain()
+{
+	gDemoApplication->moveAndDisplay();
+}
+
+static void glutMouseFuncCallback(int button, int state, int x, int y)	// registered with glutMouseFunc; forwards to DemoApplication::mouseFunc
+{
+	gDemoApplication->mouseFunc(button,state,x,y);
+}
+
+
+static void	glutMotionFuncCallback(int x,int y)	// registered for both glutMotionFunc and glutPassiveMotionFunc; forwards to DemoApplication::mouseMotionFunc
+{
+	gDemoApplication->mouseMotionFunc(x,y);
+}
+
+
+static void glutDisplayCallback(void)	// registered with glutDisplayFunc; forwards to DemoApplication::displayCallback
+{
+	gDemoApplication->displayCallback();
+}
+
+
+/// Opens the GLUT window, initializes the demo and connects the GLUT callbacks to it.
+///
+/// @param argc,argv     forwarded to glutInit()
+/// @param width,height  initial window size
+/// @param title         window caption
+/// @param demoApp       demo that receives every callback; stored in gDemoApplication
+/// @return always 0
+///
+/// NOTE(review): this function performs one moveAndDisplay() step and returns without
+/// entering glutMainLoop() -- presumably the caller runs the loop; confirm.
+int glutmain(int argc, char **argv,int width,int height,const char* title,DemoApplication* demoApp) {
+
+	// the C callbacks above reach the demo through this global
+	gDemoApplication = demoApp;
+
+	// window with double buffering, RGBA colour, depth and stencil
+	glutInit(&argc, argv);
+	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH | GLUT_STENCIL);
+	glutInitWindowPosition(0, 0);
+	glutInitWindowSize(width, height);
+	glutCreateWindow(title);
+#ifdef BT_USE_FREEGLUT
+	glutSetOption (GLUT_ACTION_ON_WINDOW_CLOSE, GLUT_ACTION_GLUTMAINLOOP_RETURNS);
+#endif
+
+	gDemoApplication->myinit();
+
+	// keyboard
+	glutKeyboardFunc(glutKeyboardCallback);
+	glutKeyboardUpFunc(glutKeyboardUpCallback);
+	glutSpecialFunc(glutSpecialKeyboardCallback);
+	glutSpecialUpFunc(glutSpecialKeyboardUpCallback);
+
+	// window, idle, mouse and display
+	glutReshapeFunc(glutReshapeCallback);
+	//createMenu();
+	glutIdleFunc(glutMoveAndDisplayCallback);
+	glutMouseFunc(glutMouseFuncCallback);
+	glutPassiveMotionFunc(glutMotionFuncCallback);
+	glutMotionFunc(glutMotionFuncCallback);
+	glutDisplayFunc( glutDisplayCallback );
+
+	// run one simulation/render step right away
+	glutMoveAndDisplayCallback();
+
+//enable vsync to avoid tearing on Apple (todo: for Windows)
+
+#if defined(__APPLE__) && !defined (VMDMESA)
+	int swapInterval = 1;
+	CGLContextObj currentContext = CGLGetCurrentContext();
+	CGLSetParameter(currentContext, kCGLCPSwapInterval, &swapInterval);
+#endif
+
+	return 0;
+}
+
+
+#endif //_WINDOWS
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/GlutStuff.h
@@ -0,0 +1,86 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2012 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+#ifndef GLUT_STUFF_H
+#define GLUT_STUFF_H
+
+#ifdef _WIN32//for glut.h
+#include <windows.h>
+#endif
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#include <GLUT/glut.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/gl.h>
+#include <GL/glut.h>
+#endif //_WINDOWS
+#endif //APPLE
+
+// Platform-neutral names for the keys and modifier bits used by the demos.
+// With _WINDOWS only BT_ACTIVE_ALT is provided (as the Win32 virtual key VK_LMENU);
+// otherwise every name maps to its GLUT counterpart.
+#ifdef _WINDOWS
+#define BT_ACTIVE_ALT   VK_LMENU
+
+#else
+#define BT_KEY_K 'k'
+#define BT_KEY_LEFT			GLUT_KEY_LEFT
+#define BT_KEY_RIGHT		GLUT_KEY_RIGHT
+#define BT_KEY_UP			GLUT_KEY_UP
+#define BT_KEY_DOWN			GLUT_KEY_DOWN
+#define	BT_KEY_F1			GLUT_KEY_F1
+#define	BT_KEY_F2			GLUT_KEY_F2
+#define	BT_KEY_F3			GLUT_KEY_F3
+#define	BT_KEY_F4			GLUT_KEY_F4
+#define	BT_KEY_F5			GLUT_KEY_F5
+#define BT_KEY_PAGEUP		GLUT_KEY_PAGE_UP
+#define BT_KEY_PAGEDOWN		GLUT_KEY_PAGE_DOWN
+#define BT_KEY_END			GLUT_KEY_END
+#define BT_KEY_HOME			GLUT_KEY_HOME
+#define BT_ACTIVE_ALT		GLUT_ACTIVE_ALT
+// CTRL must map to GLUT_ACTIVE_CTRL; it used to alias GLUT_ACTIVE_ALT, which made a
+// pressed CTRL key indistinguishable from ALT in m_modifierKeys.
+#define	BT_ACTIVE_CTRL		GLUT_ACTIVE_CTRL
+#define BT_ACTIVE_SHIFT		GLUT_ACTIVE_SHIFT
+#endif
+
+// Tested with #ifdef, exactly like the use in GlutStuff.cpp, so that the freeglut
+// extensions are declared whenever that file relies on them (also when the macro is
+// defined without a value).
+#ifdef BT_USE_FREEGLUT
+#include "GL/freeglut_ext.h" //to be able to return from glutMainLoop()
+#endif
+
+
+
+class DemoApplication;
+
+int glutmain(int argc, char **argv,int width,int height,const char* title,DemoApplication* demoApp);
+
+// btgl* select the OpenGL entry points matching the precision of btScalar:
+// the 'd' (double) variants with BT_USE_DOUBLE_PRECISION, the 'f' (float) variants otherwise.
+#if defined(BT_USE_DOUBLE_PRECISION)
+#define btglLoadMatrix glLoadMatrixd
+#define btglMultMatrix glMultMatrixd
+#define btglColor3 glColor3d
+#define btglVertex3 glVertex3d
+#else
+#define btglLoadMatrix glLoadMatrixf
+#define btglMultMatrix glMultMatrixf
+#define btglColor3 glColor3f
+// was glVertex3d, the only double entry point in the single-precision branch
+#define btglVertex3 glVertex3f
+#endif
+
+#endif //GLUT_STUFF_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.cpp
@@ -0,0 +1,86 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "RenderTexture.h"
+#include <memory.h>
+
+
+renderTexture::renderTexture(int width,int height)	// allocates a width x height buffer and fills every pixel
+:m_height(height),m_width(width)
+{
+	m_buffer = new unsigned char[m_width*m_height*4];	// 4 bytes per pixel, matching setPixel()'s four components
+	
+	//clear screen
+	memset(m_buffer,0,m_width*m_height*4);
+
+	//clear screen version 2: overwrites every pixel that the memset above just zeroed
+	for (int x=0;x<m_width;x++)
+	{
+		for (int y=0;y<m_height;y++)
+		{
+			setPixel(x,y,btVector4(float(x),float(y),0.f,1.f));	// NOTE(review): x and y are passed unscaled, outside the [0..1] range setPixel() documents -- confirm the intended pattern
+		}
+
+	}
+
+}
+
+/// Draws a text string into the texture by adding each glyph texel to the existing pixel.
+///
+/// @param str         zero-terminated text; a null pointer draws nothing
+/// @param fontData    font bitmap; a null pointer draws nothing. The indexing below assumes
+///                    256x256 texels of 3 bytes each, last row first, holding 16x16 pixel
+///                    glyph cells starting at character 32 -- NOTE(review): confirm against
+///                    the font asset
+/// @param rasterposx  texture x coordinate of the first glyph column
+/// @param rasterposy  texture y coordinate of the first glyph row
+///
+/// Every character advances the pen by 10 pixels. Characters below 32 have no glyph and
+/// leave a blank; pixels that fall outside the texture are skipped.
+void renderTexture::grapicalPrintf(char* str,	void* fontData, int rasterposx,int rasterposy)
+{
+	if (!str || !fontData)
+	{
+		return;
+	}
+
+	const unsigned char* fontPtr = (const unsigned char*) fontData;
+	unsigned char c;
+	int penX = 0;
+
+	while ((c = (unsigned char) *str++))
+	{
+		// Glyph number relative to the first printable character. It is kept in an
+		// int: stored in a 'char' it became negative for c < 32 and, where char is
+		// signed, for c >= 160, which indexed outside the font bitmap.
+		const int glyph = int(c) - 32;
+
+		if (glyph >= 0)
+		{
+			// top-left texel of the glyph cell inside the 16x16 grid
+			const int cellX = (glyph%16)*16;
+			const int cellY = (glyph/16)*16;
+
+			for (int i=0;i<16;i++)
+			{
+				const int px = rasterposx+penX+i;
+				if (px<0 || px>=m_width)
+				{
+					continue;
+				}
+
+				for (int j=0;j<16;j++)
+				{
+					const int py = rasterposy+j;
+					if (py<0 || py>=m_height)
+					{
+						continue;
+					}
+
+					// only the first byte of the 3-byte texel is read and used as a grey level
+					const unsigned char packedColor = fontPtr[(cellX+i)*3+255*256*3-(256*(cellY+j))*3];
+					const float colorf = packedColor/255.f;
+					addPixel(px,py,btVector4(colorf,colorf,colorf,1.f));
+				}
+			}
+		}
+
+		// glyph cells are 16 wide but the pen moves 10, so neighbouring glyphs overlap
+		penX += 10;
+	}
+}
+
+renderTexture::~renderTexture()	// releases the pixel buffer allocated by the constructor
+{
+	delete [] m_buffer;
+}
+
+
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/RenderTexture.h
@@ -0,0 +1,73 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2006 Erwin Coumans  http://continuousphysics.com/Bullet/
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef RENDER_TEXTURE_H
+#define RENDER_TEXTURE_H
+
+#include "LinearMath/btVector3.h"
+#include "GLDebugFont.h"
+
+///
+///renderTexture provides a software-render context (setpixel/printf)
+///
+/// Pixels are stored row by row as 4 unsigned bytes (R,G,B,A); pixel (x,y) starts at byte
+/// (x+y*width)*4. The coordinates passed to the pixel functions are not range checked.
+/// The object owns its buffer and therefore cannot be copied.
+///
+class renderTexture
+{
+	int m_height;
+	int m_width;
+	unsigned char*	m_buffer;
+
+	// The buffer is released with delete[] in the destructor; a compiler-generated copy
+	// would free it twice, so copying is disabled (declared private, never defined).
+	renderTexture(const renderTexture&);
+	renderTexture& operator=(const renderTexture&);
+
+	/// Scales a colour component to a byte. Values outside [0..1] (and NaN) saturate
+	/// instead of being converted out of range, which is undefined for unsigned char.
+	static unsigned char toByte(btScalar component)
+	{
+		const double scaled = 255.*component;
+		if (!(scaled > 0.))
+		{
+			return 0;
+		}
+		if (scaled >= 255.)
+		{
+			return 255;
+		}
+		return (unsigned char)scaled;
+	}
+
+public:
+
+	/// Allocates and fills a width x height texture.
+	renderTexture(int width,int height);
+	~renderTexture();
+
+	///rgba input is in range [0..1] for each component; values outside are clamped
+	inline void	setPixel(int x,int y,const btVector4& rgba)
+	{
+		unsigned char* pixel = &m_buffer[ (x+y*m_width) * 4];
+
+		pixel[0] = toByte(rgba.getX());
+		pixel[1] = toByte(rgba.getY());
+		pixel[2] = toByte(rgba.getZ());
+		pixel[3] = toByte(rgba.getW());
+	}
+
+	///adds rgb (scaled by 255) to the stored colour, saturating at 255; alpha is left unchanged
+	inline void	addPixel(int x,int y,const btVector4& rgba)
+	{
+		unsigned char* pixel = &m_buffer[ (x+y*m_width) * 4];
+		pixel[0] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[0] + btScalar(255.f)*rgba.getX()));
+		pixel[1] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[1] + btScalar(255.f)*rgba.getY()));
+		pixel[2] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[2] + btScalar(255.f)*rgba.getZ()));
+//		pixel[3] = (unsigned char)btMin(btScalar(255.f),((btScalar)pixel[3] + btScalar(255.f)*rgba.getW()));
+	}
+
+	///returns the stored colour with each component mapped back to [0..1]
+	inline btVector4 getPixel(int x,int y) const
+	{
+		const unsigned char* pixel = &m_buffer[ (x+y*m_width) * 4];
+		return btVector4(pixel[0]*1.f/255.f,
+			pixel[1]*1.f/255.f,
+			pixel[2]*1.f/255.f,
+			pixel[3]*1.f/255.f);
+	}
+
+	const unsigned char*	getBuffer() const { return m_buffer;}
+	int	getWidth() const { return m_width;}
+	int	getHeight() const { return m_height;}
+
+	///draws the text str with the bitmap font fontData, starting at (startx,starty)
+	void grapicalPrintf(char* str,	void* fontData, int startx = 0,int starty=0);
+
+};
+
+#endif //RENDER_TEXTURE_H
+
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32AppMain.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32AppMain.cpp
@@ -0,0 +1,405 @@
+#ifdef _WINDOWS
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2010 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <windows.h>
+#include <gl/gl.h>
+
+
+#include "DemoApplication.h"
+
+#include "GLDebugDrawer.h"
+#include "GLDebugFont.h"
+
+#include "BulletDynamics/Dynamics/btDynamicsWorld.h"
+
+/// This Win32AppMain is shared code between all demos. 
+/// The actual demo, derived from DemoApplication is created using 'createDemo', in a separate .cpp file
+DemoApplication* gDemoApplication = 0;	// created in WinMain via createDemo(), used by WndProc, deleted at the end of WinMain
+DemoApplication*	createDemo();	// factory implemented by each demo in a separate .cpp file
+
+
+// Function Declarations
+
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam);
+void EnableOpenGL(HWND hWnd, HDC * hDC, HGLRC * hRC);
+void DisableOpenGL(HWND hWnd, HDC hDC, HGLRC hRC);
+static bool sOpenGLInitialized = false;	// true between EnableOpenGL() and DisableOpenGL(); gates reshape() calls in WndProc
+static int sWidth = 0;	// last client width recorded by WM_SIZE
+static int sHeight =0;	// last client height recorded by WM_SIZE
+static int quitRequest = 0;	// set on Q/ESC key-down; WM_CHAR stops forwarding characters once it is set
+
+// WinMain
+//
+// Win32 entry point shared by all demos. It creates the demo through createDemo(),
+// opens a 1024x768 window with an OpenGL context, initializes the demo and then
+// alternates between message handling and moveAndDisplay() until WM_QUIT arrives.
+//
+// Returns the wParam of the WM_QUIT message, or 0 when the demo, the window class or
+// the window could not be created.
+
+int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, 
+				   LPSTR lpCmdLine, int iCmdShow)
+{
+	// not used: the window is always created visible with a fixed size
+	(void)hPrevInstance;
+	(void)lpCmdLine;
+	(void)iCmdShow;
+
+	gDemoApplication = createDemo();
+	if (!gDemoApplication)
+	{
+		return 0;
+	}
+
+	// register window class
+	WNDCLASS wc;
+	wc.style = CS_OWNDC;
+	wc.lpfnWndProc = WndProc;
+	wc.cbClsExtra = 0;
+	wc.cbWndExtra = 0;
+	wc.hInstance = hInstance;
+	wc.hIcon = LoadIcon( NULL, IDI_APPLICATION );
+	wc.hCursor = LoadCursor( NULL, IDC_ARROW );
+	wc.hbrBackground = (HBRUSH)GetStockObject( BLACK_BRUSH );
+	wc.lpszMenuName = NULL;
+	wc.lpszClassName = "BulletPhysics";
+	if (!RegisterClass( &wc ))
+	{
+		delete gDemoApplication;
+		gDemoApplication = 0;
+		return 0;
+	}
+
+	// create main window
+	HWND hWnd = CreateWindow( 
+		"BulletPhysics", "Bullet Physics Sample. http://bulletphysics.org", 
+		WS_CAPTION | WS_VISIBLE | WS_OVERLAPPEDWINDOW,
+		0, 0, 1024, 768,
+		NULL, NULL, hInstance, NULL );
+	if (!hWnd)
+	{
+		// without a window there is nothing to attach OpenGL to
+		delete gDemoApplication;
+		gDemoApplication = 0;
+		return 0;
+	}
+
+	// enable OpenGL for the window
+	HDC hDC;
+	HGLRC hRC;
+	EnableOpenGL( hWnd, &hDC, &hRC );
+
+	GLDebugDrawer debugDraw;
+	gDemoApplication->myinit();
+	//gDemoApplication->reshape(1024, 768);
+	gDemoApplication->initPhysics();
+	if (gDemoApplication->getDynamicsWorld())
+		gDemoApplication->getDynamicsWorld()->setDebugDrawer(&debugDraw);
+
+	// WM_SIZE messages handled before OpenGL was enabled only recorded the size in
+	// sWidth/sHeight (see WndProc); apply it to the demo now.
+	gDemoApplication->reshape(sWidth,sHeight);
+
+	// program main loop
+	MSG msg;
+	ZeroMemory( &msg, sizeof( msg ) );
+	bool quit = false;
+	while ( !quit )
+	{
+		// check for messages: at most one is handled per rendered frame
+		if ( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE )  )
+		{
+			// handle or dispatch messages
+			if ( msg.message == WM_QUIT ) 
+			{
+				quit = true;
+			} 
+			else 
+			{
+				TranslateMessage( &msg );
+				DispatchMessage( &msg );
+			}
+
+//			gDemoApplication->displayCallback();
+		}
+
+		// OpenGL animation code goes here
+		glClearColor( .7f, 0.7f, 0.7f, 1.f );
+
+		gDemoApplication->moveAndDisplay();
+
+		SwapBuffers( hDC );
+	}
+
+	// shutdown OpenGL
+	DisableOpenGL( hWnd, hDC, hRC );
+
+	// destroy the window explicitly
+	DestroyWindow( hWnd );
+
+	// the demo goes first; debugDraw, which its world may still reference, outlives it
+	delete gDemoApplication;
+	gDemoApplication = 0;
+
+	// exit code passed to PostQuitMessage()
+	return static_cast<int>(msg.wParam);
+
+}
+
+// Window Procedure
+//
+// Translates Win32 messages into DemoApplication calls:
+//  - WM_SYSKEYDOWN/UP  keep m_modifierKeys in sync with the ALT key (bit 29 of lParam)
+//  - WM_SIZE           records the client size and, once OpenGL is up, reshapes the demo
+//  - mouse buttons     mouseFunc(button, state, x, y): button 0/1/2 = left/middle/right,
+//                      state 0 = pressed, 1 = released
+//  - mouse wheel       zoomIn()/zoomOut()
+//  - WM_KEYDOWN/UP     navigation keys go to specialKeyboard()/specialKeyboardUp(),
+//                      space resets the scene, Q and ESC quit
+//  - WM_CHAR           keyboardCallback(), unless a quit was requested
+// Everything else is left to DefWindowProc.
+
+// Mouse positions are packed into lParam as two signed 16 bit words. They have to be
+// read as 'short': a plain LOWORD/HIWORD turns negative coordinates (possible with
+// several monitors or a captured mouse) into large positive values.
+static inline int mouseXFromLParam(LPARAM lParam)
+{
+	return (int)(short)LOWORD(lParam);
+}
+
+static inline int mouseYFromLParam(LPARAM lParam)
+{
+	return (int)(short)HIWORD(lParam);
+}
+
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+	switch (message)
+	{
+
+	case WM_SYSKEYDOWN:
+		{
+			// bit 29 of lParam is set while ALT is held down
+			if (lParam & (1<<29))
+			{
+				gDemoApplication->m_modifierKeys = VK_LMENU;
+			}
+			break;
+		}
+	case WM_SYSKEYUP:
+		{
+			if (lParam & (1<<29))
+			{
+				gDemoApplication->m_modifierKeys = VK_LMENU;
+			} else
+			{
+				gDemoApplication->m_modifierKeys = 0;
+			}
+			break;
+		}
+
+	case WM_SIZE:													// Size Action Has Taken Place
+		switch (wParam)												// Evaluate Size Action
+		{
+		case SIZE_MAXIMIZED:										// Was Window Maximized?
+		case SIZE_RESTORED:											// Was Window Restored?
+			sWidth = LOWORD (lParam);
+			sHeight = HIWORD (lParam);
+			// before EnableOpenGL() the size is only remembered; WinMain applies it later
+			if (sOpenGLInitialized)
+			{
+				gDemoApplication->reshape(sWidth,sHeight);
+			}
+			return 0;
+
+		case SIZE_MINIMIZED:										// Was Window Minimized?
+		default:
+			return 0;
+		}
+
+	case WM_CREATE:
+		return 0;
+
+	case WM_LBUTTONDOWN:
+		gDemoApplication->mouseFunc(0,0,mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+	case WM_LBUTTONUP:
+		gDemoApplication->mouseFunc(0,1,mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+	case WM_MBUTTONDOWN:
+		gDemoApplication->mouseFunc(1,0,mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+	case WM_MBUTTONUP:
+		gDemoApplication->mouseFunc(1,1,mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+	case WM_RBUTTONDOWN:
+		gDemoApplication->mouseFunc(2,0,mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+	case WM_RBUTTONUP:
+		gDemoApplication->mouseFunc(2,1,mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+
+	case WM_MOUSEMOVE:
+		gDemoApplication->mouseMotionFunc(mouseXFromLParam(lParam),mouseYFromLParam(lParam));
+		break;
+
+	case 0x020A://WM_MOUSEWHEEL:
+		{
+			// the wheel rotation is the signed high word of wParam
+			const int zDelta = (short)HIWORD(wParam);
+			if (zDelta>0)
+				gDemoApplication->zoomIn();
+			else
+				gDemoApplication->zoomOut();
+			break;
+		}
+
+	case WM_CLOSE:
+		PostQuitMessage( 0 );
+		return 0;
+
+	case WM_DESTROY:
+		return 0;
+
+	case WM_KEYUP:
+		switch ( wParam )
+		{
+		case VK_PRIOR:
+		case VK_NEXT:
+		case VK_END:
+		case VK_HOME:
+		case VK_LEFT:
+		case VK_UP:
+		case VK_RIGHT:
+		case VK_DOWN:
+			{
+				if (gDemoApplication)
+					gDemoApplication->specialKeyboardUp(wParam,0,0);
+				return 0;
+			}
+		default:
+			{
+				// ordinary keys are reported in lower case and also get default handling
+				if (gDemoApplication)
+					gDemoApplication->keyboardUpCallback(tolower(wParam),0,0);
+				return DefWindowProc( hWnd, message, wParam, lParam );
+			}
+		}
+
+	case WM_KEYDOWN:
+		switch ( wParam )
+		{
+		case VK_CONTROL:
+		case VK_PRIOR:
+		case VK_NEXT:
+		case VK_END:
+		case VK_HOME:
+		case VK_LEFT:
+		case VK_UP:
+		case VK_RIGHT:
+		case VK_DOWN:
+			{
+				if (gDemoApplication)
+					gDemoApplication->specialKeyboard(wParam,0,0);
+				break;
+			}
+
+		case ' ':
+			{
+				if (gDemoApplication)
+					gDemoApplication->clientResetScene();
+				break;
+			}
+		case 'Q':
+		case VK_ESCAPE:
+			{
+				// remember the request so that the WM_CHAR generated for this key
+				// is no longer forwarded to the demo
+				quitRequest = 1;
+				PostQuitMessage(0);
+			}
+			return 0;
+
+		}
+		return 0;
+
+	case WM_CHAR:
+		if (!quitRequest)
+			gDemoApplication->keyboardCallback(wParam,0,0);
+		break;
+
+	default:
+		return DefWindowProc( hWnd, message, wParam, lParam );
+
+	}
+	return 0;
+}
+
+// Enable OpenGL
+//
+// Fetches the window's device context, selects a double-buffered RGBA pixel format
+// (24 colour bits, 16 depth bits, 1 stencil bit), then creates a render context and
+// makes it current. The DC and RC are returned through hDC and hRC, and
+// sOpenGLInitialized is set so that WndProc starts forwarding resizes.
+
+void EnableOpenGL(HWND hWnd, HDC * hDC, HGLRC * hRC)
+{
+	// get the device context (DC)
+	*hDC = GetDC( hWnd );
+
+	// describe the wanted pixel format; every field not listed stays zero
+	PIXELFORMATDESCRIPTOR desc;
+	ZeroMemory( &desc, sizeof( desc ) );
+	desc.nSize = sizeof( desc );
+	desc.nVersion = 1;
+	desc.iLayerType = PFD_MAIN_PLANE;
+	desc.iPixelType = PFD_TYPE_RGBA;
+	desc.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
+	desc.cColorBits = 24;
+	desc.cDepthBits = 16;
+	desc.cStencilBits = 1;
+
+	// set the pixel format for the DC
+	const int pixelFormat = ChoosePixelFormat( *hDC, &desc );
+	SetPixelFormat( *hDC, pixelFormat, &desc );
+
+	// create and enable the render context (RC)
+	*hRC = wglCreateContext( *hDC );
+	wglMakeCurrent( *hDC, *hRC );
+
+	sOpenGLInitialized = true;
+}
+
+// Disable OpenGL
+
+void DisableOpenGL(HWND hWnd, HDC hDC, HGLRC hRC)	// undoes EnableOpenGL() for the given window, DC and RC
+{
+	sOpenGLInitialized = false;	// WndProc stops forwarding resizes to the demo from here on
+	// unbind the render context from this thread, then delete it and give the DC back
+	wglMakeCurrent( NULL, NULL );
+	wglDeleteContext( hRC );
+	ReleaseDC( hWnd, hDC );
+}
+
+#endif //_WINDOWS
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.cpp
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.cpp
@@ -0,0 +1,79 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifdef _WINDOWS
+
+#include "Win32DemoApplication.h"
+
+
+
+
+// The empty member stubs below are compiled out by the "#if 0" guard, so this
+// file provides no definitions for them.
+#if 0
+void	Win32DemoApplication::renderme()
+{
+}
+void	Win32DemoApplication::setTexturing(bool useTexture)
+{
+}
+	
+void	Win32DemoApplication::setShadows(bool useShadows)
+{
+}
+	
+void	Win32DemoApplication::setCameraDistance(float camDist)
+{
+}
+void	Win32DemoApplication::clientResetScene()
+{
+
+}
+#endif
+
+// Placeholder: modifier-key tracking is not implemented for the Win32
+// application yet, so this call does nothing.
+// NOTE(review): presumably meant to refresh Shift/Ctrl/Alt state - confirm
+// against DemoApplication.
+void Win32DemoApplication::updateModifierKeys()
+{
+	//not yet
+}
+
+
+
+// Handles a non-character key press identified by a Win32 virtual-key code.
+//
+// key  : virtual-key code; only the four arrow keys are acted upon.
+// x, y : unused.
+void Win32DemoApplication::specialKeyboard(int key, int x, int y)
+{
+	(void)x;
+	(void)y;
+
+	// arrow keys drive the step* helpers
+	if (key == VK_LEFT)
+	{
+		stepLeft();
+	}
+	else if (key == VK_RIGHT)
+	{
+		stepRight();
+	}
+	else if (key == VK_UP)
+	{
+		stepFront();
+	}
+	else if (key == VK_DOWN)
+	{
+		stepBack();
+	}
+	// Every other key is ignored. Mappings that are still disabled:
+	//	GLUT_KEY_PAGE_UP   : zoomIn()
+	//	GLUT_KEY_PAGE_DOWN : zoomOut()
+	//	GLUT_KEY_HOME      : toggleIdle()
+}
+
+// Empty implementation: no buffer swap is performed here.
+// NOTE(review): presumably the back buffer is presented elsewhere in the
+// Win32 application code - confirm.
+void	Win32DemoApplication::swapBuffers()
+{
+}
+	
+#endif
+	
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.h
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/Win32DemoApplication.h
@@ -0,0 +1,40 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2009 Erwin Coumans  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#ifndef WIN32_DEMO_APPLICATION_H
+#define WIN32_DEMO_APPLICATION_H
+
+
+#include "DemoApplication.h"
+
+// DemoApplication variant for the Win32 build: key handling works on Win32
+// virtual-key codes (VK_*).
+class Win32DemoApplication : public DemoApplication
+{
+public:
+	// Empty in Win32DemoApplication.cpp; performs no buffer swap itself.
+	virtual void	swapBuffers();
+
+	// Reacts to VK_LEFT / VK_RIGHT / VK_UP / VK_DOWN; x and y are unused.
+	void specialKeyboard(int key, int x, int y);
+
+	// Not implemented yet (no-op in Win32DemoApplication.cpp).
+	virtual		void	updateModifierKeys();
+};
+
+#endif //WIN32_DEMO_APPLICATION_H
--- a/Extras/RigidBodyGpuPipeline/dynamics/testbed/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/dynamics/testbed/premake4.lua
@@ -0,0 +1,18 @@
+	-- premake4 project: builds the "testbed" sources as a static library
+	project("testbed")
+
+	kind("StaticLib")
+	targetdir("../../build/lib")
+
+	-- include paths used by every configuration
+	includedirs { ".", "../../bullet2" }
+
+	-- additional include path, applied only under the "Windows" filter
+	configuration { "Windows" }
+	includedirs { "../../rendering/GlutGlewWindows" }
+
+	-- empty filter: the settings that follow apply to all configurations
+	configuration {}
+
+	-- all sources and headers below this directory ("**" matches recursively)
+	files { "**.cpp", "**.h" }