Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/AMD/premake4.lua
@@ -0,0 +1,29 @@
+	
+	-- Premake4 project for the OpenCL 3D grid broadphase (AMD OpenCL flavour).
+	-- findOpenCL_AMD/initOpenCL_AMD are not defined in this file (presumably
+	-- provided by the including premake script -- confirm); the project is
+	-- only emitted when findOpenCL_AMD() returns a true value.
+	hasCL = findOpenCL_AMD()
+	
+	if (hasCL) then
+
+		project "OpenCL_bt3dGridBroadphase_AMD"
+
+		initOpenCL_AMD()
+	
+		language "C++"
+				
+		kind "StaticLib"
+		targetdir "../../../bin"
+
+		libdirs {"../../../rendering/GlutGlewWindows"}
+
+			includedirs {
+--		"../../../rendering/GlutGlewWindows",
+		"../../../opencl/3dGridBroadphase/Shared",
+		"../../../../../src",
+		"../../primitives"
+		}
+		
+		-- all sources live in the sibling Shared directory
+		files {
+			"../Shared/*.cpp",
+			"../Shared/*.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/MiniCL/MiniCLTaskWrap.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/MiniCL/MiniCLTaskWrap.cpp
@@ -0,0 +1,23 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2003-2007 Erwin Coumans  http://bulletphysics.com
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include <MiniCL/cl_MiniCL_Defs.h>
+
+// Compile the OpenCL kernel file as host code: with MSTRINGIFY defined as the
+// identity, the MSTRINGIFY( ... ) wrapper around the .cl source expands to the
+// kernel functions themselves, which get C linkage from this extern block.
+extern "C"
+{
+	#define MSTRINGIFY(A) A
+	#include "bt3dGridBroadphaseOCL.cl"
+	#undef MSTRINGIFY
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cl
@@ -0,0 +1,349 @@
+
+MSTRINGIFY(
+
+// Map a 3D cell coordinate to a linear cell index (x fastest, then y, then z).
+// Each coordinate is wrapped with a bit mask against the grid dimensions read
+// from pParams[1]; the mask is only a true modulo for power-of-two dimensions.
+int getPosHash(int4 gridPos, __global float4* pParams)
+{
+	int4 dims = *((__global int4*)(pParams + 1));
+	int cx = gridPos.x & (dims.x - 1);
+	int cy = gridPos.y & (dims.y - 1);
+	int cz = gridPos.z & (dims.z - 1);
+	return (cz * dims.y + cy) * dims.x + cx;
+}
+
+// Convert a position to wrapped integer cell coordinates: each axis is scaled
+// by pParams[0], floored, and masked with (grid dimension - 1) from pParams[1].
+// The .w component of the result is left unset.
+int4 getGridPos(float4 worldPos, __global float4* pParams)
+{
+	float4 scale = pParams[0];
+	int4 dims = *((__global int4*)(pParams + 1));
+	int4 cell;
+	cell.x = ((int)floor(worldPos.x * scale.x)) & (dims.x - 1);
+	cell.y = ((int)floor(worldPos.y * scale.y)) & (dims.y - 1);
+	cell.z = ((int)floor(worldPos.z * scale.z)) & (dims.z - 1);
+	return cell;
+}
+
+
+// Kernel, one work-item per body: hash the grid cell containing the centre of
+// the body's AABB and store (cell hash, body index) in pHash[index].
+__kernel void kCalcHashAABB(int numObjects, __global float4* pAABB, __global int2* pHash, __global float4* pParams GUID_ARG)
+{
+	int bodyIdx = get_global_id(0);
+	if(bodyIdx < numObjects)
+	{
+		// each body owns two consecutive float4 entries: min then max
+		float4 lo = pAABB[bodyIdx*2];
+		float4 hi = pAABB[bodyIdx*2 + 1];
+		float4 centre;
+		centre.x = (lo.x + hi.x) * 0.5f;
+		centre.y = (lo.y + hi.y) * 0.5f;
+		centre.z = (lo.z + hi.z) * 0.5f;
+		centre.w = 0.f;
+		int4 cell = getGridPos(centre, pParams);
+		int2 entry;
+		entry.x = getPosHash(cell, pParams);
+		entry.y = bodyIdx;
+		pHash[bodyIdx] = entry;
+	}
+}
+
+// Kernel, one work-item per grid cell: mark every cell as empty (-1).
+__kernel void kClearCellStart(	int numCells, 
+								__global int* pCellStart GUID_ARG)
+{
+	int cellIdx = get_global_id(0);
+	if(cellIdx < numCells)
+	{
+		pCellStart[cellIdx] = -1;
+	}
+}
+
+// Kernel, one work-item per entry of pHash: writes into cellStart[hash] the
+// index of each entry whose hash differs from its predecessor's (entry 0
+// always counts). This only yields "first entry of each cell" when pHash is
+// already ordered by hash.
+// sharedHash has 513 slots and is indexed by local id + 1, so the work-group
+// size must not exceed 512.
+__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart GUID_ARG)
+{
+	__local int sharedHash[513];
+    int index = get_global_id(0);
+	int2 sortedData;
+    if(index < numObjects)
+	{
+		sortedData = pHash[index];
+		// Load hash data into shared memory so that we can look 
+		// at neighboring body's hash value without loading
+		// two hash values per thread
+		sharedHash[get_local_id(0) + 1] = sortedData.x;
+		if((index > 0) && (get_local_id(0) == 0))
+		{
+			// first thread in block must load neighbor body hash
+			sharedHash[0] = pHash[index-1].x;
+		}
+	}
+    // the barrier sits outside the range check so every work-item of the group reaches it
+    barrier(CLK_LOCAL_MEM_FENCE);
+    if(index < numObjects)
+	{
+		// sharedHash[local id] is the predecessor's hash (slot local id + 1 is our own)
+		if((index == 0) || (sortedData.x != sharedHash[get_local_id(0)]))
+		{
+			cellStart[sortedData.x] = index;
+		}
+	}
+}
+
+// Returns non-zero when the two axis-aligned boxes overlap (touching counts)
+// on all of the x, y and z axes; the .w components are ignored.
+int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
+{
+	int overlapX = (min0.x <= max1.x) && (min1.x <= max0.x);
+	int overlapY = (min0.y <= max1.y) && (min1.y <= max0.y);
+	int overlapZ = (min0.z <= max1.z) && (min1.z <= max0.z);
+	return overlapX && overlapY && overlapZ;
+}
+
+
+
+
+
+// Test the body at sorted position `index` against every body hashed into the
+// cell gridPos and merge the overlaps into that body's pair list.
+// The pair list of a body lives in pPairBuff at [start, start+curr), where
+// (start, curr) = pPairBuffStartCurr[handle]; the handle is read from the
+// bits of the .w component of the body's AABB minimum.
+// Flag bits stored with each pair entry:
+//   0x40000000 - entry already existed and was found overlapping again
+//   0x20000000 - entry was appended during this pass
+void findPairsInCell(	int numObjects,
+						int4	gridPos,
+						int    index,
+						__global int2*  pHash,
+						__global int*   pCellStart,
+						__global float4* pAABB, 
+						__global int*   pPairBuff,
+						__global int2*	pPairBuffStartCurr,
+						__global float4* pParams)
+{
+	int4 pGridDim = *((__global int4*)(pParams + 1));
+	// the .w component of the grid-dimension vector carries the per-cell body limit
+	int maxBodiesPerCell = pGridDim.w;
+    int gridHash = getPosHash(gridPos, pParams);
+    // get start of bucket for this cell
+    int bucketStart = pCellStart[gridHash];
+    if (bucketStart == -1)
+	{
+        return;   // cell empty
+	}
+	// iterate over bodies in this cell
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+    float4 min0 = pAABB[unsorted_indx*2 + 0]; 
+	float4 max0 = pAABB[unsorted_indx*2 + 1];
+	int handleIndex =  as_int(min0.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	// capacity of this body's list is derived from where the next handle's list starts
+	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	int curr_max = start_curr_next.x - start - 1;
+	int bucketEnd = bucketStart + maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;
+	for(int index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        int2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		int unsorted_indx2 = cellData.y;
+		// only lower-indexed bodies are considered: this skips the body itself
+		// and records each pair once, on the higher-indexed body
+        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
+        {   
+			float4 min1 = pAABB[unsorted_indx2*2 + 0];
+			float4 max1 = pAABB[unsorted_indx2*2 + 1];
+			if(testAABBOverlap(min0, max0, min1, max1))
+			{
+				int handleIndex2 = as_int(min1.w);
+				int k;
+				// linear search of the existing list, comparing with flag bits masked off
+				for(k = 0; k < curr; k++)
+				{
+					int old_pair = pPairBuff[start+k] & (~0x60000000);
+					if(old_pair == handleIndex2)
+					{
+						pPairBuff[start+k] |= 0x40000000;
+						break;
+					}
+				}
+				if(k == curr)
+				{
+					// not found: append, unless the list is full (this break leaves the bucket loop)
+					if(curr >= curr_max) 
+					{ // not a good solution, but let's avoid crash
+						break;
+					}
+					pPairBuff[start+curr] = handleIndex2 | 0x20000000;
+					curr++;
+				}
+			}
+		}
+	}
+	// write back the (possibly grown) entry count
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = curr;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+    return;
+}
+
+// Kernel, one work-item per entry of pHash: locate the grid cell holding the
+// centre of the body's AABB and run findPairsInCell on that cell and its 26
+// neighbours (offsets -1..1 per axis, x varying fastest).
+__kernel void kFindOverlappingPairs(	int numObjects,
+										__global float4* pAABB, 
+										__global int2* pHash, 
+										__global int* pCellStart, 
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global float4* pParams GUID_ARG)
+
+{
+	int sortedIdx = get_global_id(0);
+	if(sortedIdx >= numObjects)
+	{
+		return;
+	}
+	// pHash[...].y is the body's index into the AABB array
+	int bodyIdx = pHash[sortedIdx].y;
+	float4 lo = pAABB[bodyIdx*2 + 0];
+	float4 hi = pAABB[bodyIdx*2 + 1];
+	float4 centre;
+	centre.x = (lo.x + hi.x) * 0.5f;
+	centre.y = (lo.y + hi.y) * 0.5f;
+	centre.z = (lo.z + hi.z) * 0.5f;
+	int4 home = getGridPos(centre, pParams);
+	int4 neighbour;
+	// neighbour coordinates may leave the grid here; getPosHash wraps them
+	for(int dz = -1; dz <= 1; dz++)
+	{
+		neighbour.z = home.z + dz;
+		for(int dy = -1; dy <= 1; dy++)
+		{
+			neighbour.y = home.y + dy;
+			for(int dx = -1; dx <= 1; dx++)
+			{
+				neighbour.x = home.x + dx;
+				findPairsInCell(numObjects, neighbour, sortedIdx, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams);
+			}
+		}
+	}
+}
+
+
+// Kernel, one work-item per entry of pHash: test the body's AABB against each
+// of the numLarge AABBs stored after the first numObjects entries of pAABB and
+// merge the overlaps into the body's pair list in pPairBuff.
+// Flag bits stored with each pair entry:
+//   0x40000000 - entry already existed and was found overlapping again
+//   0x20000000 - entry was appended during this pass
+// pCellStart is accepted but not used by this kernel.
+__kernel void kFindPairsLarge(	int numObjects, 
+								__global float4* pAABB, 
+								__global int2* pHash, 
+								__global int* pCellStart, 
+								__global int* pPairBuff, 
+								__global int2* pPairBuffStartCurr, 
+								uint numLarge GUID_ARG)
+{
+    int index = get_global_id(0);
+    if(index >= numObjects)
+	{
+		return;
+	}
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+	float4 min0 = pAABB[unsorted_indx*2 + 0];
+	float4 max0 = pAABB[unsorted_indx*2 + 1];
+	// the handle is carried in the bits of the .w component of the AABB minimum
+	int handleIndex =  as_int(min0.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	// capacity of this body's list is derived from where the next handle's list starts
+	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	int curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		int indx2 = numObjects + i;
+		float4 min1 = pAABB[indx2*2 + 0];
+		float4 max1 = pAABB[indx2*2 + 1];
+		if(testAABBOverlap(min0, max0, min1, max1))
+		{
+			int k;
+			int handleIndex2 =  as_int(min1.w);
+			// linear search of the existing list, comparing with flag bits masked off
+			for(k = 0; k < curr; k++)
+			{
+				int old_pair = pPairBuff[start+k] & (~0x60000000);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= 0x40000000;
+					break;
+				}
+			}
+			if(k == curr)
+			{
+				// Check capacity BEFORE storing (same order as findPairsInCell):
+				// a full list must not receive a write that is never counted.
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | 0x20000000;
+				curr++;
+			}
+		}
+    }
+	// write back the (possibly grown) entry count
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = curr;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+    return;
+}
+
+// Kernel, one work-item per body: count the entries of the body's pair list
+// that do not carry the 0x40000000 flag and store the count in pPairScan[index+1].
+__kernel void kComputePairCacheChanges(	int numObjects,
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global int* pPairScan, 
+										__global float4* pAABB GUID_ARG)
+{
+	int bodyIdx = get_global_id(0);
+	if(bodyIdx >= numObjects)
+	{
+		return;
+	}
+	// the handle is carried in the bits of the .w component of the AABB minimum
+	float4 lo = pAABB[bodyIdx * 2];
+	int handle = as_int(lo.w);
+	// range.x = first slot of the list, range.y = number of entries in it
+	int2 range = pPairBuffStartCurr[handle];
+	int changed = 0;
+	for(int slot = 0; slot < range.y; slot++)
+	{
+		int entry = pPairBuff[range.x + slot];
+		if((entry & 0x40000000) == 0)
+		{
+			changed++;
+		}
+	}
+	pPairScan[bodyIdx+1] = changed;
+}
+
+// Kernel, one work-item per body: walks the body's pair list once and
+//  - copies every entry WITHOUT the 0x40000000 flag (flags left intact) to
+//    pPairOut, starting at offset pPairScan[index+1];
+//  - keeps every entry that has 0x20000000 or 0x40000000 set, compacting the
+//    kept entries to the front of the list in place with both flags cleared.
+// Entries carrying neither flag are therefore dropped from the list, and the
+// body's entry count in pPairBuffStartCurr is updated to the number kept.
+__kernel void kSqueezeOverlappingPairBuff(	int numObjects,
+											__global int* pPairBuff, 
+											__global int2* pPairBuffStartCurr, 
+											__global int* pPairScan,
+											__global int* pPairOut, 
+											__global float4* pAABB GUID_ARG)
+{
+    int index = get_global_id(0);
+    if(index >= numObjects)
+	{
+		return;
+	}
+	float4 bbMin = pAABB[index * 2];
+	// the handle is carried in the bits of the .w component of the AABB minimum
+	int handleIndex = as_int(bbMin.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	__global int* pInp = pPairBuff + start;
+	__global int* pOut = pPairOut + pPairScan[index+1];
+	// pOut2 is the in-place write cursor; it never runs ahead of the read cursor pInp
+	__global int* pOut2 = pInp;
+	int num = 0; 
+	for(int k = 0; k < curr; k++, pInp++)
+	{
+		// the read of *pInp for pPairOut must happen before the in-place store below
+		if(!((*pInp) & 0x40000000))
+		{
+			*pOut = *pInp;
+			pOut++;
+		}
+		if((*pInp) & 0x60000000)
+		{
+			*pOut2 = (*pInp) & (~0x60000000);
+			pOut2++;
+			num++;
+		}
+	}
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = num;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+}
+
+
+
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
@@ -0,0 +1,697 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+
+#include "bt3dGridBroadphaseOCL.h"
+
+#include <stdio.h>
+#include <string.h>
+#include "Adl/Adl.h"
+#include <AdlPrimitives/Scan/PrefixScan.h>
+#include <AdlPrimitives/Sort/RadixSort32.h>
+#include <AdlPrimitives/Sort/RadixSort.h>
+
+#define ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+
+#define GRID_OCL_PATH "..\\..\\opencl\\3dGridBroadphase\\Shared\\bt3dGridBroadphaseOCL.cl"
+
+
+#define MSTRINGIFY(A) #A
+
+static const char* spProgramSource = 
+#include "bt3dGridBroadphaseOCL.cl"
+
+// NOTE: the Adl scan/sort helpers and m_srcClBuffer are file-scope globals,
+// not members (despite the m_ prefix). The bt3dGridBroadphaseOCL constructor
+// assigns them and its destructor frees them, so they are shared and
+// overwritten if more than one instance exists at a time.
+adl::PrefixScan<adl::TYPE_CL>::Data* gData1=0;
+adl::Buffer<unsigned int>* m_srcClBuffer=0;
+
+// not referenced in the visible part of this file
+struct MySortData
+{
+	int key;
+	int value;
+};
+
+adl::RadixSort32<adl::TYPE_CL>::Data* dataC = 0;
+adl::RadixSort<adl::TYPE_HOST>::Data* dataHost = 0;
+
+
+// value written near the end of m_srcClBuffer by the constructor
+static unsigned int infElem = 0x2fffffff;
+
+static unsigned int zeroEl = 0;
+// not referenced in the visible part of this file
+static unsigned int minusOne= -1;
+
+
+// Construct the OpenCL broadphase on top of the CPU implementation
+// (btGpu3DGridBroadphase): compile the kernel program, create and pre-fill the
+// device buffers, create the kernels, then set up the Adl scan/sort helpers.
+// context/device/queue must be valid (asserted in initCL). deviceCL may be
+// null, in which case an adl::DeviceCL wrapping the same context, device and
+// queue is created here and owned by this object.
+bt3dGridBroadphaseOCL::bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
+												const btVector3& cellSize, 
+												int gridSizeX, int gridSizeY, int gridSizeZ, 
+												int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+												btScalar maxSmallProxySize,
+												int maxSmallProxiesPerCell,
+												cl_context context, cl_device_id device, cl_command_queue queue,
+												adl::DeviceCL* deviceCL
+												) : 
+	btGpu3DGridBroadphase(overlappingPairCache, cellSize, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy, maxSmallProxySize, maxSmallProxiesPerCell)
+{
+
+
+	// order matters: initKernels needs the program from initCL and binds the
+	// buffers created by allocateBuffers as kernel arguments
+	initCL(context, device, queue);
+	allocateBuffers();
+	
+	prefillBuffers();
+
+	initKernels();
+
+	//create an Adl device host and OpenCL device
+
+	adl::DeviceUtils::Config cfg;
+	m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );
+	m_ownsDevice = false;
+	if (!deviceCL)
+	{
+		// no Adl device supplied: wrap the caller's CL objects in our own
+		m_ownsDevice = true;
+		deviceCL = new adl::DeviceCL;
+		deviceCL->m_context = context;
+		deviceCL->m_deviceIdx = device;
+		deviceCL->m_commandQueue = queue;
+		deviceCL->m_kernelManager = new adl::KernelManager;
+	}
+
+	m_deviceCL = deviceCL;
+
+	// the device-side scan and sort helpers are sized for at least 256K elements
+	int minSize = 256*1024;
+	int maxSortBuffer = maxSmallProxies < minSize ? minSize :maxSmallProxies;
+
+	// file-scope scan source buffer: element 0 is set to 0, element
+	// maxSmallProxies to infElem and element maxSmallProxies+1 to 0
+	m_srcClBuffer = new adl::Buffer<unsigned int> (m_deviceCL,maxSmallProxies+2);
+	m_srcClBuffer->write(&zeroEl,1,0);
+
+	//m_srcClBuffer->write(&infElem,maxSmallProxies,0);
+	m_srcClBuffer->write(&infElem,1,maxSmallProxies);
+	m_srcClBuffer->write(&zeroEl,1,maxSmallProxies+1);
+	m_deviceCL->waitForCompletion();
+	
+	// NOTE: gData1, dataHost and dataC are file-scope globals; constructing a
+	// second instance overwrites them
+	gData1 = adl::PrefixScan<adl::TYPE_CL>::allocate( m_deviceCL, maxSortBuffer+2,adl::PrefixScanBase::EXCLUSIVE );
+	dataHost = adl::RadixSort<adl::TYPE_HOST>::allocate( m_deviceHost, maxSmallProxies+2 );
+	dataC = adl::RadixSort32<adl::TYPE_CL>::allocate( m_deviceCL, maxSortBuffer+2 );
+	
+}
+
+
+
+// Free everything the constructor created: the Adl scan/sort helpers, the scan
+// source buffer, the OpenCL kernels, buffers and program, and - only when this
+// object created it - the Adl OpenCL device wrapper.
+// The caller's cl_context, cl_device_id and cl_command_queue are not released.
+bt3dGridBroadphaseOCL::~bt3dGridBroadphaseOCL()
+{
+	//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
+	assert(m_bInitialized);
+
+	// The Adl helpers live in file-scope globals: null them after freeing so a
+	// stale pointer cannot be reused or freed a second time.
+	adl::RadixSort<adl::TYPE_HOST>::deallocate(dataHost);
+	dataHost = 0;
+	adl::PrefixScan<adl::TYPE_CL>::deallocate(gData1);
+	gData1 = 0;
+	adl::RadixSort32<adl::TYPE_CL>::deallocate(dataC);
+	dataC = 0;
+	adl::DeviceUtils::deallocate(m_deviceHost);
+	m_deviceHost = 0;
+	delete m_srcClBuffer;
+	m_srcClBuffer = 0;
+
+	// release the kernels created by initKernels()
+	const int kernelIds[] =
+	{
+		GRID3DOCL_KERNEL_CALC_HASH_AABB,
+		GRID3DOCL_KERNEL_CLEAR_CELL_START,
+		GRID3DOCL_KERNEL_FIND_CELL_START,
+		GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS,
+		GRID3DOCL_KERNEL_FIND_PAIRS_LARGE,
+		GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES,
+		GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF
+	};
+	for (unsigned int i = 0; i < sizeof(kernelIds) / sizeof(kernelIds[0]); i++)
+	{
+		if (m_kernels[kernelIds[i]].m_kernel)
+		{
+			clReleaseKernel(m_kernels[kernelIds[i]].m_kernel);
+			m_kernels[kernelIds[i]].m_kernel = 0;
+		}
+	}
+
+	// release the device buffers created by allocateBuffers()
+	cl_mem* buffers[] =
+	{
+		&m_dBodiesHash, &m_dCellStart, &m_dPairBuff, &m_dPairBuffStartCurr,
+		&m_dAABB, &m_dPairScanChanged, &m_dPairsChanged, &m_dPairsContiguous,
+		&m_dBpParams
+	};
+	for (unsigned int i = 0; i < sizeof(buffers) / sizeof(buffers[0]); i++)
+	{
+		if (*buffers[i])
+		{
+			clReleaseMemObject(*buffers[i]);
+			*buffers[i] = 0;
+		}
+	}
+
+	// release the program compiled by initCL()
+	if (m_cpProgram)
+	{
+		clReleaseProgram(m_cpProgram);
+		m_cpProgram = 0;
+	}
+
+	if (m_ownsDevice)
+	{
+		delete m_deviceCL->m_kernelManager;
+		delete m_deviceCL;
+	}
+	m_deviceCL = 0;
+}
+
+#ifdef CL_PLATFORM_MINI_CL
+// there is a problem with MSVC9 : static constructors are not called if variables defined in library and are not used
+// looks like it is because of optimization
+// probably this will happen with other compilers as well
+// so to make it robust, register kernels again (it is safe)
+#define MINICL_DECLARE(a) extern "C" void a();
+MINICL_DECLARE(kCalcHashAABB)
+MINICL_DECLARE(kClearCellStart)
+MINICL_DECLARE(kFindCellStart)
+MINICL_DECLARE(kFindOverlappingPairs)
+MINICL_DECLARE(kFindPairsLarge)
+MINICL_DECLARE(kComputePairCacheChanges)
+MINICL_DECLARE(kSqueezeOverlappingPairBuff)
+#undef MINICL_DECLARE
+#endif
+
+// Store the caller's OpenCL context, device and queue (all must be non-null)
+// and compile the broadphase kernel program from the embedded source string.
+// GUID_ARG is defined empty for the OpenCL compiler. Under MiniCL the kernels
+// are additionally registered by hand first.
+void bt3dGridBroadphaseOCL::initCL(cl_context context, cl_device_id device, cl_command_queue queue)
+{
+
+	#ifdef CL_PLATFORM_MINI_CL
+		// call constructors here
+		MINICL_REGISTER(kCalcHashAABB)
+		MINICL_REGISTER(kClearCellStart)
+		MINICL_REGISTER(kFindCellStart)
+		MINICL_REGISTER(kFindOverlappingPairs)
+		MINICL_REGISTER(kFindPairsLarge)
+		MINICL_REGISTER(kComputePairCacheChanges)
+		MINICL_REGISTER(kSqueezeOverlappingPairBuff)
+	#endif
+
+	// pre-set so the check below stays meaningful even if the helper does not
+	// touch the value on success
+	cl_int ciErrNum = CL_SUCCESS;
+
+	btAssert(context);
+	m_cxMainContext = context;
+	btAssert(device);
+	m_cdDevice = device;
+	btAssert(queue);
+	m_cqCommandQue = queue;
+	
+	//adl::Kernel kern = m_deviceCL->getKernel(fileName,funcName,options,src);
+	
+	// the adjacent empty literals concatenate, so the option is just "-DGUID_ARG="
+	m_cpProgram = btOpenCLUtils::compileCLProgramFromString(m_cxMainContext,m_cdDevice,spProgramSource, &ciErrNum,"-DGUID_ARG=""""",GRID_OCL_PATH);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// only report success when a program was actually produced
+	if (m_cpProgram)
+	{
+		printf("OK\n");
+	}
+	else
+	{
+		printf("3D GRID OCL Error : failed to build %s\n", GRID_OCL_PATH);
+		btAssert(m_cpProgram != 0);
+	}
+}
+
+
+// Create the seven broadphase kernels and bind their buffer arguments.
+// Argument 0 of each kernel (the element count) is set at launch time by
+// runKernelWithWorkgroupSize; argument 6 of kFindPairsLarge is set in
+// findPairsLarge.
+void bt3dGridBroadphaseOCL::initKernels()
+{
+	const int memArg = sizeof(cl_mem);
+	int arg;
+
+	initKernel(GRID3DOCL_KERNEL_CALC_HASH_AABB,	"kCalcHashAABB");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, arg++, memArg, (void*)&m_dAABB);
+	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, arg++, memArg, (void*)&m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_CALC_HASH_AABB, arg++, memArg, (void*)&m_dBpParams);
+
+	initKernel(GRID3DOCL_KERNEL_CLEAR_CELL_START, "kClearCellStart");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_CLEAR_CELL_START, arg++, memArg, (void*)&m_dCellStart);
+
+	initKernel(GRID3DOCL_KERNEL_FIND_CELL_START, "kFindCellStart");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, arg++, memArg, (void*)&m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_CELL_START, arg++, memArg, (void*)&m_dCellStart);
+
+	initKernel(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, "kFindOverlappingPairs");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, arg++, memArg, (void*)&m_dAABB);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, arg++, memArg, (void*)&m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, arg++, memArg, (void*)&m_dCellStart);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, arg++, memArg, (void*)&m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, arg++, memArg, (void*)&m_dPairBuffStartCurr);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, arg++, memArg, (void*)&m_dBpParams);
+
+	initKernel(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, "kFindPairsLarge");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, arg++, memArg, (void*)&m_dAABB);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, arg++, memArg, (void*)&m_dBodiesHash);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, arg++, memArg, (void*)&m_dCellStart);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, arg++, memArg, (void*)&m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, arg++, memArg, (void*)&m_dPairBuffStartCurr);
+
+	initKernel(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, "kComputePairCacheChanges");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, arg++, memArg, (void*)&m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, arg++, memArg, (void*)&m_dPairBuffStartCurr);
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, arg++, memArg, (void*)&m_dPairScanChanged);
+	setKernelArg(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, arg++, memArg, (void*)&m_dAABB);
+
+	initKernel(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, "kSqueezeOverlappingPairBuff");
+	arg = 1;
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, arg++, memArg, (void*)&m_dPairBuff);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, arg++, memArg, (void*)&m_dPairBuffStartCurr);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, arg++, memArg, (void*)&m_dPairScanChanged);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, arg++, memArg, (void*)&m_dPairsChanged);
+	setKernelArg(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, arg++, memArg, (void*)&m_dAABB);
+
+}
+
+
+// Create all device buffers used by the broadphase kernels. Sizes are derived
+// from the limits held by the base class (m_maxHandles, m_maxLargeHandles,
+// m_maxPairsPerBody, m_numCells). Every creation result is checked with
+// GRID3DOCL_CHECKERROR.
+void bt3dGridBroadphaseOCL::allocateBuffers()
+{
+    cl_int ciErrNum;
+    unsigned int memSize;
+	// current version of bitonic sort works for power of 2 arrays only, so ...
+	// m_hashSize = smallest power of two >= m_maxHandles
+	m_hashSize = 1;
+	for(int bit = 1; bit < 32; bit++)
+	{
+		if(m_hashSize >= m_maxHandles)
+		{
+			break;
+		}
+		m_hashSize <<= 1;
+	}
+	// (hash, index) pair per entry, with a 1 MB floor
+	memSize = m_hashSize * 2 * sizeof(unsigned int);
+	if (memSize < 1024*1024)
+		memSize = 1024*1024;
+
+	m_dBodiesHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = m_numCells * sizeof(unsigned int);
+	m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+	m_dPairBuff = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// The kernels read this buffer as int2 and access element [handle + 1],
+	// i.e. up to m_maxHandles + 1 whole int2 elements. Allocate whole elements:
+	// the former (m_maxHandles * 2 + 1) uints left the last int2 read 4 bytes
+	// past the end. Host copies of the smaller size still fit.
+	memSize = (m_maxHandles + 1) * 2 * sizeof(unsigned int);
+	m_dPairBuffStartCurr = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// two float4 (min, max) per small and per large body
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	memSize = numAABB * sizeof(float) * 4 * 2;
+	m_dAABB = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = (m_maxHandles + 2) * sizeof(unsigned int);
+	m_dPairScanChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+	m_dPairsChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// same size as m_dPairsChanged
+	m_dPairsContiguous = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// room for three float4; setParameters uploads two (scale, grid size)
+	memSize = 3 * 4 * sizeof(float);
+	m_dBpParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+// Upload initial contents: the hash buffer is filled with 0xFF bytes up to
+// m_hashSize entries, the pair start/count table is copied from the host
+// array as it stands, and the pair buffer is zeroed.
+void bt3dGridBroadphaseOCL::prefillBuffers()
+{
+	const unsigned int hashBytes = m_maxHandles * 2 * sizeof(unsigned int);
+	const unsigned int pairBytes = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+
+	memset(m_hBodiesHash, 0xFF, hashBytes);
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, hashBytes);
+	// now fill the rest (bitonic sorting works with size == pow of 2):
+	// reuse the start of the host array as the source for the tail entries
+	int tailEntries = m_hashSize - m_maxHandles;
+	if(tailEntries != 0)
+	{
+		copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, tailEntries * 2 * sizeof(unsigned int), hashBytes, 0);
+	}
+
+	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int));
+
+	memset(m_hPairBuff, 0x00, pairBytes);
+	copyArrayToDevice(m_dPairBuff, m_hPairBuff, pairBytes);
+}
+
+
+// Create the kernel called pName from the compiled program, query its maximum
+// work-group size on m_cdDevice and record both in m_kernels[kernelId].
+// pName is stored as a pointer, not copied.
+void bt3dGridBroadphaseOCL::initKernel(int kernelId, char* pName)
+{
+	cl_int status;
+	cl_kernel newKernel = clCreateKernel(m_cpProgram, pName, &status);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	size_t maxGroupSize;
+	status = clGetKernelWorkGroupInfo(newKernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(maxGroupSize), &maxGroupSize, NULL);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	m_kernels[kernelId].m_Id = kernelId;
+	m_kernels[kernelId].m_name = pName;
+	m_kernels[kernelId].m_kernel = newKernel;
+	m_kernels[kernelId].m_workgroupSize = (int)maxGroupSize;
+}
+
+// Launch kernel kernelId over globalSize elements (no-op when globalSize <= 0).
+// globalSize is passed to the kernel as argument 0. The work-group size is the
+// kernel's recorded size capped at 64, and the global size is rounded up to a
+// multiple of it; if no positive size is recorded the runtime picks one.
+// The queue is flushed, not waited on.
+void bt3dGridBroadphaseOCL::runKernelWithWorkgroupSize(int kernelId, int globalSize)
+{
+	if(globalSize <= 0)
+	{
+		return;
+	}
+	cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;
+	cl_int status = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	const int groupSize = btMin(64,m_kernels[kernelId].m_workgroupSize);
+	size_t globalItems = globalSize;
+	if(groupSize > 0)
+	{
+		// ceil(globalSize / groupSize) groups
+		int numGroups = globalSize / groupSize + ((globalSize % groupSize) ? 1 : 0);
+		size_t localItems = groupSize;
+		globalItems = numGroups * groupSize;
+		status = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, &globalItems, &localItems, 0,0,0 );
+	}
+	else
+	{ // let OpenCL library calculate workgroup size
+		status = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, &globalItems, NULL, 0,0,0 );
+	}
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+
+	status = clFlush(m_cqCommandQue);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+
+// Set argument argNum of kernel kernelId to the argSize bytes at argPtr and
+// check the result with GRID3DOCL_CHECKERROR.
+void bt3dGridBroadphaseOCL::setKernelArg(int kernelId, int argNum, int argSize, void* argPtr)
+{
+	cl_kernel target = m_kernels[kernelId].m_kernel;
+	cl_int status = clSetKernelArg(target, argNum, argSize, argPtr);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+
+// Blocking upload of size bytes from host + hostOffs to the device buffer at
+// byte offset devOffs. Does nothing when size is 0.
+void bt3dGridBroadphaseOCL::copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs, int hostOffs)
+{
+	if (size == 0)
+	{
+		return;
+	}
+	char* src = (char*)host + hostOffs;
+	cl_int status = clEnqueueWriteBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, src, 0, NULL, NULL);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+// Blocking download of size bytes from the device buffer at byte offset
+// devOffs into host + hostOffs. Does nothing when size is 0.
+void bt3dGridBroadphaseOCL::copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs, int devOffs)
+{
+	if (size == 0)
+	{
+		return;
+	}
+	char* dst = (char*)host + hostOffs;
+	cl_int status = clEnqueueReadBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, dst, 0, NULL, NULL);
+	GRID3DOCL_CHECKERROR(status, CL_SUCCESS);
+}
+
+
+
+//
+// overrides
+//
+
+
+// Let the base class fill the host AABB array, then upload the entries of all
+// current small and large handles (two bt3DGrid3F1U per handle) to the device.
+void bt3dGridBroadphaseOCL::prepareAABB()
+{
+	btGpu3DGridBroadphase::prepareAABB();
+
+	const unsigned int handleCount = m_numHandles + m_numLargeHandles;
+	copyArrayToDevice(m_dAABB, m_hAABB, sizeof(bt3DGrid3F1U) * 2 * handleCount);
+}
+
+// Forward the parameters to the base class, then upload the subset the kernels
+// read: a float4 of inverse cell sizes followed by an int4 holding the grid
+// dimensions with the per-cell body limit in the fourth component.
+void bt3dGridBroadphaseOCL::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	btGpu3DGridBroadphase::setParameters(hostParams);
+
+	// layout must match what the kernels read from pParams[0] and pParams[1]
+	struct btParamsBpOCL
+	{
+		float m_invCellSize[4];
+		int   m_gridSize[4];
+	};
+
+	btParamsBpOCL deviceParams;
+	deviceParams.m_gridSize[0] = m_params.m_gridSizeX;
+	deviceParams.m_gridSize[1] = m_params.m_gridSizeY;
+	deviceParams.m_gridSize[2] = m_params.m_gridSizeZ;
+	deviceParams.m_gridSize[3] = m_params.m_maxBodiesPerCell;
+	deviceParams.m_invCellSize[0] = m_params.m_invCellSizeX;
+	deviceParams.m_invCellSize[1] = m_params.m_invCellSizeY;
+	deviceParams.m_invCellSize[2] = m_params.m_invCellSizeZ;
+	deviceParams.m_invCellSize[3] = 0.f;
+
+	copyArrayToDevice(m_dBpParams, &deviceParams, sizeof(btParamsBpOCL));
+}
+
+
+// Run kCalcHashAABB over the m_numHandles small bodies. With
+// ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK defined (as it is in this file) the call
+// waits for the queue to drain. The #else branch is the disabled CPU fallback.
+void bt3dGridBroadphaseOCL::calcHashAABB()
+{
+	BT_PROFILE("calcHashAABB");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CALC_HASH_AABB, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+
+#else
+	btGpu3DGridBroadphase::calcHashAABB();
+#endif
+	
+	return;
+}
+
+
+// Sort the (hash, index) entries in m_dBodiesHash.
+//  - MiniCL build: sort on the host via the base class, then upload.
+//  - USE_HOST (commented out by default): download, Adl host radix sort, upload.
+//  - default: Adl RadixSort32 directly on the device buffer.
+void bt3dGridBroadphaseOCL::sortHash()
+{
+	BT_PROFILE("sortHash");
+#ifdef CL_PLATFORM_MINI_CL
+	//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+	btGpu3DGridBroadphase::sortHash();
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#else
+	
+//#define USE_HOST
+#ifdef USE_HOST
+	copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+	//adl::Buffer<unsigned int> keysIn,keysOut,valuesIn,valuesOut;
+	///adl::RadixSort32<adl::TYPE_CL>::execute(dataC,keysIn,keysOut,valuesIn,valuesOut,m_numHandles);
+	adl::HostBuffer<adl::SortData> inoutHost;
+	inoutHost.m_device = m_deviceHost;
+	inoutHost.m_ptr = (adl::SortData*)m_hBodiesHash;
+	inoutHost.m_size = m_numHandles;
+	adl::RadixSort<adl::TYPE_HOST>::execute(dataHost, inoutHost,m_numHandles);
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#else
+	{
+	clFinish(m_cqCommandQue);
+	BT_PROFILE("RadixSort32::execute");
+	// wrap the raw cl_mem in a non-owning Adl buffer view
+	adl::Buffer<adl::SortData> inout;
+	inout.m_device = this->m_deviceCL;
+	inout.m_size = m_numHandles;
+	inout.m_ptr = (adl::SortData*)m_dBodiesHash;
+	int actualHandles = m_numHandles;
+	int dataAlignment = adl::RadixSort32<adl::TYPE_CL>::DATA_ALIGNMENT;
+
+	// round the element count up to a multiple of DATA_ALIGNMENT
+	// NOTE(review): the sort then also covers entries past m_numHandles; this
+	// presumably relies on those still holding the 0xFF fill from
+	// prefillBuffers so they sort to the end - confirm for shrinking handle counts
+	if (actualHandles%dataAlignment)
+	{
+		actualHandles += dataAlignment-(actualHandles%dataAlignment);
+	}
+
+	adl::RadixSort32<adl::TYPE_CL>::execute(dataC,inout, actualHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	}
+	{
+		//BT_PROFILE("copyArrayFromDevice");
+	//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	}
+
+
+#endif //USE_HOST
+#endif
+
+	return;
+}
+
+
+
+// Rebuild the cell-start table. MiniCL build: compute on the host via the base
+// class and upload m_hCellStart. Otherwise: clear all m_numCells entries with
+// kClearCellStart, then run kFindCellStart over the m_numHandles hash entries.
+void bt3dGridBroadphaseOCL::findCellStart()
+{
+#if 1
+	BT_PROFILE("findCellStart");
+		
+	#if defined(CL_PLATFORM_MINI_CL)
+		btGpu3DGridBroadphase::findCellStart();
+		copyArrayToDevice(m_dCellStart, m_hCellStart, m_numCells * sizeof(unsigned int));
+	#else
+			runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CLEAR_CELL_START, m_numCells);	
+			runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_CELL_START, m_numHandles);
+	#endif
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findCellStart();
+#endif
+
+	return;
+}
+
+
+
+// Run kFindOverlappingPairs over the m_numHandles small bodies. The #else
+// branch is the disabled CPU fallback, which would compute on the host and
+// upload the pair tables.
+void bt3dGridBroadphaseOCL::findOverlappingPairs()
+{
+#if 1
+	BT_PROFILE("findOverlappingPairs");
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findOverlappingPairs();
+	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); 
+	copyArrayToDevice(m_dPairBuff, m_hPairBuff, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
+#endif
+	return;
+}
+
+
+// Test small bodies against large ones on the device. The kernel is skipped
+// when there are no large handles. Argument 6 (the kernel's numLarge) is
+// refreshed on every call because the count can change between frames.
+void bt3dGridBroadphaseOCL::findPairsLarge()
+{
+	BT_PROFILE("findPairsLarge");
+#if 1
+	if(m_numLargeHandles)
+	{
+		setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 6, sizeof(int),(void*)&m_numLargeHandles);
+		runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, m_numHandles);
+	}
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findPairsLarge();
+#endif
+	return;
+}
+
+
+
+// Run kComputePairCacheChanges over the m_numHandles small bodies, then
+// download the per-body change counts (m_numHandles + 2 values) into
+// m_hPairScanChanged with a blocking read.
+void bt3dGridBroadphaseOCL::computePairCacheChanges()
+{
+	BT_PROFILE("computePairCacheChanges");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+	copyArrayFromDevice( m_hPairScanChanged,m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+
+#else
+	btGpu3DGridBroadphase::computePairCacheChanges();
+	copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+	
+
+#endif
+	return;
+}
+
+
+
+
+extern cl_device_type deviceType;
+
+// Prefix-scan the per-body change counts on the device.
+// The counts in m_dPairScanChanged are first copied into the file-scope
+// m_srcClBuffer, then Adl PrefixScan writes the scanned values back into
+// m_dPairScanChanged and stores the total in m_numPrefixSum.
+// copyToCpu: when true, the scanned array (m_numHandles + 2 values) is also
+// downloaded into m_hPairScanChanged.
+void bt3dGridBroadphaseOCL::scanOverlappingPairBuff(bool copyToCpu)
+{
+
+	//Intel/CPU version doesn't handle Adl scan well
+#if 0
+	{
+		copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+		btGpu3DGridBroadphase::scanOverlappingPairBuff();
+		copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+		m_numPrefixSum = m_hPairScanChanged[m_numHandles+1];
+		clFinish(m_cqCommandQue);
+		//memset(m_hPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
+	}
+#else
+	{
+
+	//	copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+	//	btGpu3DGridBroadphase::scanOverlappingPairBuff();
+
+		// non-owning Adl view of the raw cl_mem
+		adl::Buffer<unsigned int> destBuffer;
+		
+		{
+			BT_PROFILE("copy GPU->GPU");
+		
+			destBuffer.m_ptr = (unsigned int*)m_dPairScanChanged;
+			destBuffer.m_device = m_deviceCL;
+			// element count, not bytes - consistent with the other adl buffer
+			// views set up in this file (see sortHash)
+			destBuffer.m_size = m_numHandles+2;
+			// m_numHandles elements, offset 1 on both sides
+			m_deviceCL->copy(m_srcClBuffer, &destBuffer,m_numHandles,1,1);
+
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+		}
+
+		{
+			BT_PROFILE("PrefixScan");
+			
+			adl::PrefixScan<adl::TYPE_CL>::execute(gData1,*m_srcClBuffer,destBuffer, m_numHandles+2,&m_numPrefixSum);
+			
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+		//if (m_numPrefixSum>0x1000)
+		//	{
+		//		printf("error m_numPrefixSum==%d\n",m_numPrefixSum);
+		//	}
+
+		}
+
+#if 0
+		unsigned int* verifyhPairScanChanged = new unsigned int[m_maxHandles + 2];
+		memset(verifyhPairScanChanged,0,sizeof(int)*(m_maxHandles + 2));
+
+		copyArrayFromDevice(verifyhPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
+		clFinish(m_cqCommandQue);
+
+		/*for (int i=0;i<m_numHandles+2;i++)
+		{
+			if (verifyhPairScanChanged[i] != m_hPairScanChanged[i])
+			{
+				printf("hello!\n");
+			}
+		}
+		*/
+
+#endif
+
+
+		if (1)
+		{
+			
+			//the data 
+			if (copyToCpu)
+			{
+				BT_PROFILE("copy GPU -> CPU");
+				copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+			}
+
+		}
+
+	}
+#endif
+
+	
+}
+
+
+
+// Run kSqueezeOverlappingPairBuff over the m_numHandles small bodies. The
+// changed pairs stay in m_dPairsChanged on the device; the download is
+// commented out below. The profile label still carries the old btCuda name.
+void bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, m_numHandles);
+//	btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScanChanged, m_dPairsChanged, m_dAABB, m_numHandles);
+	
+	//copyArrayFromDevice(m_hPairsChanged, m_dPairsChanged, sizeof(unsigned int) * m_numPrefixSum);//m_hPairScanChanged[m_numHandles+1]); //gSum
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::squeezeOverlappingPairBuff();
+#endif
+	return;
+}
+
+
+
+// Reset the base-class state, then re-upload the initial contents of the
+// device hash and pair buffers so they match it.
+void bt3dGridBroadphaseOCL::resetPool(btDispatcher* dispatcher)
+{
+	btGpu3DGridBroadphase::resetPool(dispatcher);
+	prefillBuffers();
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
@@ -0,0 +1,146 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef BT3DGRIDBROADPHASEOCL_H
+#define BT3DGRIDBROADPHASEOCL_H
+
+#ifdef __APPLE__
+#ifdef USE_MINICL
+	#include <MiniCL/cl.h>
+#else
+	#include <MiniCL/cl.h>
+#endif
+//CL_PLATFORM_MINI_CL could be defined in build system
+#else
+//#include <GL/glew.h>
+// standard utility and system includes
+#ifdef USE_MINICL
+	#include <MiniCL/cl.h>
+#else
+	#include <CL/cl.h>
+#endif
+// Extra CL/GL include
+//#include <CL/cl_gl.h>
+#endif //__APPLE__
+
+// Forward declarations of the adl device types; this header only stores pointers to them.
+namespace adl
+{
+	struct DeviceCL;
+	struct Device;
+}
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+#include "btGpu3DGridBroadphase.h"
+
+
+#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
+
+// Identifiers of the OpenCL kernels; the values are consecutive starting at zero so that
+// GRID3DOCL_KERNEL_TOTAL can be used as an array size.
+enum
+{
+	GRID3DOCL_KERNEL_CALC_HASH_AABB         = 0,
+	GRID3DOCL_KERNEL_CLEAR_CELL_START       = 1,
+	GRID3DOCL_KERNEL_FIND_CELL_START        = 2,
+	GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS = 3,
+	GRID3DOCL_KERNEL_FIND_PAIRS_LARGE       = 4,
+	GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES  = 5,
+	GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF      = 6,
+	GRID3DOCL_KERNEL_TOTAL                  = 7	// number of kernels, not a kernel id
+};
+
+// Per-kernel bookkeeping record; bt3dGridBroadphaseOCL keeps one per GRID3DOCL_KERNEL_* id.
+struct bt3dGridOCLKernelInfo
+{
+	int			m_Id;				// presumably the GRID3DOCL_KERNEL_* index - confirm in initKernel()
+	cl_kernel	m_kernel;			// OpenCL kernel object
+	char*		m_name;				// kernel name; NOTE(review): ownership is not visible from this header
+	int			m_workgroupSize;	// presumably the local work size used when launching - confirm
+};
+
+
+///The bt3dGridBroadphaseOCL uses OpenCL-capable GPU to compute overlapping pairs
+
+// OpenCL specialisation of btGpu3DGridBroadphase: the virtual pipeline stages declared at the
+// bottom of the class are overridden, and the cl_mem members hold the device-side buffers.
+class bt3dGridBroadphaseOCL : public btGpu3DGridBroadphase
+{
+protected:
+	int						m_hashSize;
+	// OpenCL objects
+	cl_context				m_cxMainContext;
+	cl_device_id			m_cdDevice;
+	cl_command_queue		m_cqCommandQue;
+	cl_program				m_cpProgram;
+	// one record per GRID3DOCL_KERNEL_* id
+	bt3dGridOCLKernelInfo	m_kernels[GRID3DOCL_KERNEL_TOTAL];
+	// data buffers
+	// (names mirror the m_h* host arrays of the base class - presumably their device copies)
+	cl_mem					m_dBodiesHash;
+	cl_mem					m_dCellStart;
+	cl_mem					m_dPairBuff; 
+	cl_mem					m_dPairBuffStartCurr;
+public:
+	// the AABB buffer is deliberately public
+	cl_mem					m_dAABB;
+protected:
+	cl_mem					m_dPairScanChanged;
+	cl_mem					m_dPairsChanged;
+	cl_mem					m_dPairsContiguous;
+	cl_mem					m_dBpParams;
+
+	adl::Device*			m_deviceHost;
+	adl::DeviceCL*			m_deviceCL;
+	// NOTE(review): presumably true when m_deviceCL was created by this object - confirm in the .cpp
+	bool					m_ownsDevice;
+
+
+public:
+	// written by the prefix scan in scanOverlappingPairBuff()
+	unsigned int			m_numPrefixSum;
+
+	// context, device, queue and deviceCL are optional and default to NULL / 0
+	bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
+							const btVector3& cellSize, 
+							int gridSizeX, int gridSizeY, int gridSizeZ, 
+							int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+							btScalar maxSmallProxySize,
+							int maxSmallProxiesPerCell = 8,
+							cl_context context = NULL,
+							cl_device_id device = NULL,
+							cl_command_queue queue = NULL,
+							adl::DeviceCL* deviceCL = 0
+							);
+	virtual ~bt3dGridBroadphaseOCL();
+
+protected:
+	// OpenCL setup and buffer/kernel helpers
+	void initCL(cl_context context, cl_device_id device, cl_command_queue queue);
+	void initKernels();
+	void allocateBuffers();
+	void prefillBuffers();
+	void initKernel(int kernelId, char* pName);
+	void allocateArray(void** devPtr, unsigned int size);
+	void freeArray(void* devPtr);
+	void runKernelWithWorkgroupSize(int kernelId, int globalSize);
+	void setKernelArg(int kernelId, int argNum, int argSize, void* argPtr);
+	void copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs = 0, int hostOffs = 0);
+	void copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs = 0, int devOffs = 0);
+
+// overrides
+	// (pipeline stages invoked by btGpu3DGridBroadphase::calculateOverlappingPairs)
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
+	virtual void squeezeOverlappingPairBuff();
+	virtual void resetPool(btDispatcher* dispatcher);
+};
+
+#endif //BT3DGRIDBROADPHASEOCL_H
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp
@@ -0,0 +1,626 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///The 3 following lines include the CPU implementation of the kernels, keep them in this order.
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+#include "btGpuUtilsSharedCode.h"
+
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+
+
+
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+
+#include "btGpu3DGridBroadphaseSharedDefs.h"
+
+#include "btGpu3DGridBroadphase.h"
+#include <string.h> //for memset
+
+
+#include <stdio.h>
+
+
+
+// File-scope parameter block, overwritten by btGpu3DGridBroadphase::setParameters().
+// Being static it is shared by every broadphase instance in this translation unit.
+// NOTE(review): presumably read by the kernel code included at the end of this file - confirm.
+static bt3DGridBroadphaseParams s3DGridBroadphaseParams;
+
+
+
+/// Constructs a broadphase that creates its own btHashedOverlappingPairCache, placement-new'ed
+/// into 16-byte aligned storage obtained from btAlignedAlloc.
+/// (A btSortedOverlappingPairCache was the previously used alternative.)
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(const btVector3& cellSize,
+		int gridSizeX, int gridSizeY, int gridSizeZ,
+		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+		btScalar maxSmallProxySize,
+		int maxBodiesPerCell)
+	: btSimpleBroadphase(maxSmallProxies,
+			new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache), 16)) btHashedOverlappingPairCache)
+	, m_bInitialized(false)
+	, m_numBodies(0)
+{
+	// Everything else (grid parameters, host buffers, large-proxy pool) is set up here.
+	_initialize(cellSize,
+				gridSizeX, gridSizeY, gridSizeZ,
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxSmallProxySize,
+				maxBodiesPerCell);
+}
+
+
+
+/// Constructs a broadphase on top of a caller-supplied overlapping pair cache.
+/// NOTE(review): _initialize() sets m_ownsPairCache to true for this constructor as well -
+/// confirm that the caller is expected to hand over ownership of the cache.
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(btOverlappingPairCache* overlappingPairCache,
+		const btVector3& cellSize,
+		int gridSizeX, int gridSizeY, int gridSizeZ,
+		int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+		btScalar maxSmallProxySize,
+		int maxBodiesPerCell)
+	: btSimpleBroadphase(maxSmallProxies, overlappingPairCache)
+	, m_bInitialized(false)
+	, m_numBodies(0)
+{
+	// Everything else (grid parameters, host buffers, large-proxy pool) is set up here.
+	_initialize(cellSize,
+				gridSizeX, gridSizeY, gridSizeZ,
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxSmallProxySize,
+				maxBodiesPerCell);
+}
+
+
+
+/// Releases the host buffers allocated by _initialize().
+/// The pair cache itself is not freed here: m_ownsPairCache was set in _initialize(), so
+/// btSimpleBroadphase is expected to release it.
+btGpu3DGridBroadphase::~btGpu3DGridBroadphase()
+{
+	assert(m_bInitialized);
+	_finalize();
+}
+
+// Returns the largest power of two not exceeding a positive val
+// (2^n such that 2^(n+1) > val >= 2^n), found by scanning bits 30 down to 1.
+// When none of those bits is set in val (e.g. val == 0 or val == 1) the result is 1.
+int btGpu3DGridBroadphase::getFloorPowOfTwo(int val)
+{
+	int bit = 0x40000000;
+	int shifts = 0;
+	while((shifts < 30) && !(bit & val))
+	{
+		bit >>= 1;
+		++shifts;
+	}
+	return bit;
+}
+
+
+
+/// Computes the grid parameters and allocates all host-side buffers and the large-proxy pool.
+/// Must be called exactly once per object (asserted through m_bInitialized).
+///
+/// cellSize            size of one grid cell per axis; its inverse is stored in m_params
+/// gridSizeX/Y/Z       requested grid resolution, rounded with getFloorPowOfTwo()
+/// maxSmallProxies     not read here; small-proxy buffer sizes come from m_maxHandles
+/// maxLargeProxies     capacity of the large-proxy pool
+/// maxPairsPerBody     number of pair-buffer slots reserved per small proxy
+/// maxSmallProxySize   half of this value becomes m_maxRadius (used by isLargeProxy)
+/// maxBodiesPerCell    stored in m_params.m_maxBodiesPerCell and m_maxBodiesPerCell
+void btGpu3DGridBroadphase::_initialize(	const btVector3& cellSize,
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										btScalar maxSmallProxySize,
+										int maxBodiesPerCell)
+{
+	assert(!m_bInitialized);
+
+	// set various parameters
+	m_ownsPairCache = true;
+	m_params.m_gridSizeX = getFloorPowOfTwo(gridSizeX);
+	m_params.m_gridSizeY = getFloorPowOfTwo(gridSizeY);
+	m_params.m_gridSizeZ = getFloorPowOfTwo(gridSizeZ);
+	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
+	m_numCells = m_params.m_numCells;
+	m_params.m_invCellSizeX = btScalar(1.f) / cellSize[0];
+	m_params.m_invCellSizeY = btScalar(1.f) / cellSize[1];
+	m_params.m_invCellSizeZ = btScalar(1.f) / cellSize[2];
+	m_maxRadius = maxSmallProxySize * btScalar(0.5f);
+	m_params.m_numBodies = m_numBodies;
+	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
+	// keep the member copy in sync; it was previously left uninitialized
+	m_maxBodiesPerCell = maxBodiesPerCell;
+
+	m_numLargeHandles = 0;
+	m_maxLargeHandles = maxLargeProxies;
+
+	m_maxPairsPerBody = maxPairsPerBody;
+
+	m_LastLargeHandleIndex = -1;
+
+	// allocate host storage
+	m_hBodiesHash = new unsigned int[m_maxHandles * 2];
+	memset(m_hBodiesHash, 0x00, m_maxHandles * 2 * sizeof(unsigned int));
+
+	m_hCellStart = new unsigned int[m_params.m_numCells];
+	memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
+
+	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
+	// for now, reserve m_maxPairsPerBody slots for each body:
+	// entry [2*i] is the start offset of body i, entry [2*i+1] is set to 0
+	m_hPairBuffStartCurr[0] = 0;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int i = 1; i <= m_maxHandles; i++) 
+	{
+		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
+		m_hPairBuffStartCurr[i * 2 + 1] = 0;
+	}
+
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max
+
+	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
+
+	m_hPairScanChanged = new unsigned int[m_maxHandles + 2];
+	// clear the whole array: the byte count must be sizeof(element) * (m_maxHandles + 2);
+	// without the parentheses the last elements were left uninitialized
+	memset(m_hPairScanChanged, 0, sizeof(unsigned int) * (m_maxHandles + 2));
+
+	m_hPairsChanged = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hPairsChanged,0,sizeof(int)*(m_maxHandles * m_maxPairsPerBody));
+
+	m_hAllOverlappingPairs= new MyUint2[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hAllOverlappingPairs,0,sizeof(MyUint2)*(m_maxHandles * m_maxPairsPerBody));
+
+
+// large proxies
+
+	// allocate handles buffer and put all handles on free list
+	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
+	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
+	m_firstFreeLargeHandle = 0;
+	{
+		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
+		{
+			m_pLargeHandles[i].SetNextFree(i + 1);
+			// large proxies get unique ids above the small-proxy range (see isLargeProxy)
+			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
+		}
+		// terminate the free list; guarded so that an empty pool does not index element -1
+		if (m_maxLargeHandles > 0)
+		{
+			m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
+		}
+	}
+
+// debug data
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	m_numOverflows = 0;
+
+	m_bInitialized = true;
+}
+
+
+
+/// Frees every host buffer allocated in _initialize() and clears the initialised flag.
+void btGpu3DGridBroadphase::_finalize()
+{
+	assert(m_bInitialized);
+
+	// arrays obtained with new[]
+	delete[] m_hBodiesHash;
+	delete[] m_hCellStart;
+	delete[] m_hPairBuffStartCurr;
+	delete[] m_hAABB;
+	delete[] m_hPairBuff;
+	delete[] m_hPairScanChanged;
+	delete[] m_hPairsChanged;
+	delete[] m_hAllOverlappingPairs;
+
+	// the large-proxy pool lives in storage obtained from btAlignedAlloc
+	btAlignedFree(m_pLargeHandlesRawPtr);
+
+	m_bInitialized = false;
+}
+
+
+
+/// Runs the full broadphase pipeline for one step.
+/// The base-class update is executed first; afterwards the virtual stages are called in a
+/// fixed order, each inside its own BT_PROFILE scope. The stages are virtual so that derived
+/// classes can replace them.
+void btGpu3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
+{
+	btSimpleBroadphase::calculateOverlappingPairs(dispatcher);
+
+	// without small proxies only the large/large test has to run
+	if(m_numHandles <= 0)
+	{
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+		return;
+	}
+	// update constants
+	{
+		BT_PROFILE("setParameters");
+		setParameters(&m_params);
+	}
+
+	// prepare AABB array
+	{
+		BT_PROFILE("prepareAABB");
+		prepareAABB();
+	}
+	// calculate hash
+	{
+		BT_PROFILE("calcHashAABB");
+		calcHashAABB();
+	}
+	{
+		BT_PROFILE("sortHash");
+		// sort bodies based on hash
+		sortHash();
+	}
+	// find start of each cell
+	{
+		BT_PROFILE("findCellStart");
+		findCellStart();
+	}
+	{
+		BT_PROFILE("findOverlappingPairs");
+		// findOverlappingPairs (small/small)
+		findOverlappingPairs();
+	}
+	// findOverlappingPairs (small/large)
+	{
+		BT_PROFILE("findPairsLarge");
+		findPairsLarge();
+	}
+	// add pairs to CPU cache
+	{
+		BT_PROFILE("computePairCacheChanges");
+		computePairCacheChanges();
+	}
+	{
+		BT_PROFILE("scanOverlappingPairBuff");
+		// called with the default argument (copyToCpu = true)
+		scanOverlappingPairBuff();
+	}
+	{
+		BT_PROFILE("squeezeOverlappingPairBuff");
+		squeezeOverlappingPairBuff();
+	}
+	{
+		BT_PROFILE("addPairsToCache");
+		addPairsToCache(dispatcher);
+	}
+	// find and add large/large pairs to CPU cache
+	{
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+	}
+	return;
+}
+
+
+
+/// Applies the per-body lists of changed pairs to the CPU pair cache.
+/// For body i the entries live in m_hPairsChanged between the offsets m_hPairScanChanged[i+1]
+/// and m_hPairScanChanged[i+2]. An entry flagged BT_3DGRID_PAIR_NEW_FLG is added to the
+/// cache, any other entry is removed from it. m_numPairsAdded / m_numPairsRemoved are
+/// recounted from zero.
+void btGpu3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
+{
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+
+	for(int body = 0; body < m_numHandles; body++)
+	{
+		const unsigned int firstEntry = m_hPairScanChanged[body + 1];
+		const unsigned int numEntries = m_hPairScanChanged[body + 2] - firstEntry;
+		if(numEntries == 0)
+		{
+			continue;
+		}
+
+		const unsigned int* entries = m_hPairsChanged + firstEntry;
+		// the 'min' AABB record of the body carries the index of its proxy
+		const unsigned int ownIndex = m_hAABB[body * 2].uw;
+		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[ownIndex];
+
+		for(unsigned int e = 0; e < numEntries; e++)
+		{
+			const unsigned int entry = entries[e];
+			unsigned int index1 = entry & (~BT_3DGRID_PAIR_ANY_FLG);
+
+			// indices at or above m_maxHandles address the large-proxy pool
+			btSimpleBroadphaseProxy* proxy1;
+			if(index1 >= (unsigned int)m_maxHandles)
+			{
+				index1 -= m_maxHandles;
+				btAssert((index1 >= 0) && (index1 < (unsigned int)m_maxLargeHandles));
+				proxy1 = &m_pLargeHandles[index1];
+			}
+			else
+			{
+				proxy1 = &m_pHandles[index1];
+			}
+
+			const bool isNewPair = (entry & BT_3DGRID_PAIR_NEW_FLG) != 0;
+			if(isNewPair)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+				m_numPairsAdded++;
+			}
+			else
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+				m_numPairsRemoved++;
+			}
+		}
+	}
+}
+
+
+
+/// Creates a proxy for the given AABB.
+/// Proxies classified as small by isLargeProxy() are forwarded to btSimpleBroadphase; large
+/// ones are constructed in place inside the large-proxy pool.
+/// Returns 0 (after a btAssert) when the large-proxy pool is exhausted.
+btBroadphaseProxy* btGpu3DGridBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
+{
+	if(!isLargeProxy(aabbMin, aabbMax))
+	{
+		return btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
+	}
+
+	if (m_numLargeHandles >= m_maxLargeHandles)
+	{
+		///you have to increase the cell size, so 'large' proxies become 'small' proxies (fitting a cell)
+		btAssert(0);
+		return 0; //should never happen, but don't let the game crash ;-)
+	}
+
+	btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
+
+	// take a slot from the free list and construct the proxy in that slot
+	const int slot = allocLargeHandle();
+	btBroadphaseProxy* largeProxy = new (&m_pLargeHandles[slot])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
+	return largeProxy;
+}
+
+
+
+/// Destroys a proxy. Small proxies are handed to btSimpleBroadphase; a large proxy is put
+/// back on the large free list and every cached pair that contains it is removed.
+void btGpu3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
+{
+	if(!isLargeProxy(proxy))
+	{
+		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
+		return;
+	}
+
+	// release the slot first, then purge the pair cache
+	freeLargeHandle(static_cast<btSimpleBroadphaseProxy*>(proxy));
+	m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
+}
+
+
+
+/// Restores the initial layout of m_hPairBuffStartCurr: body i starts at
+/// i * m_maxPairsPerBody and the second entry of every pair is zeroed.
+/// The dispatcher argument is not used.
+void btGpu3DGridBroadphase::resetPool(btDispatcher* dispatcher)
+{
+	unsigned int startOffset = 0;
+	m_hPairBuffStartCurr[0] = startOffset;
+	m_hPairBuffStartCurr[1] = 0;
+
+	for(int body = 1; body <= m_maxHandles; ++body)
+	{
+		startOffset += m_maxPairsPerBody;
+		m_hPairBuffStartCurr[2 * body]     = startOffset;
+		m_hPairBuffStartCurr[2 * body + 1] = 0;
+	}
+}
+
+
+
+/// An AABB counts as 'large' when the radius of its bounding sphere (half the length of the
+/// box diagonal) exceeds m_maxRadius. The sphere is used so that rotation is accounted for.
+bool btGpu3DGridBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
+{
+	const btVector3 diagonal = aabbMax - aabbMin;
+	const btScalar boundingRadius = diagonal.length() * btScalar(0.5f);
+	return boundingRadius > m_maxRadius;
+}
+
+
+
+/// Classifies an existing proxy by its unique id: _initialize() numbers the large-proxy pool
+/// starting at m_maxHandles + 2, so any id at or above that value is treated as large.
+bool btGpu3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
+{
+	const int firstLargeUid = m_maxHandles + 2;
+	return proxy->getUid() >= firstLargeUid;
+}
+
+
+
+/// Brute-force test of every pair of large-proxy slots up to m_LastLargeHandleIndex.
+/// Overlapping pairs missing from the cache are added; cached pairs that no longer overlap
+/// are removed.
+/// NOTE(review): slots are not checked for being in use (no m_clientObject test) - confirm
+/// that freed slots below m_LastLargeHandleIndex cannot occur or are harmless.
+void btGpu3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
+{
+	if (m_numLargeHandles <= 0)
+	{
+		return;
+	}
+
+	int lastVisited = -1;
+	for(int first = 0; first <= m_LastLargeHandleIndex; first++)
+	{
+		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[first];
+		lastVisited = first;
+
+		for(int second = first + 1; second <= m_LastLargeHandleIndex; second++)
+		{
+			btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[second];
+			btAssert(proxy0 != proxy1);
+
+			btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
+			btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
+
+			const bool overlapping = aabbOverlap(p0,p1);
+			const bool alreadyCached = (m_pairCache->findPair(proxy0,proxy1) != 0);
+
+			if(overlapping && !alreadyCached)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+			}
+			else if(!overlapping && alreadyCached)
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+			}
+		}
+	}
+	m_LastLargeHandleIndex = lastVisited;
+}
+
+
+
+/// Ray query: the base class handles the small proxies, then every large-proxy slot up to
+/// m_LastLargeHandleIndex is passed to the callback without any AABB pre-test.
+void btGpu3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
+{
+	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
+
+	int slot = 0;
+	while(slot <= m_LastLargeHandleIndex)
+	{
+		btSimpleBroadphaseProxy* largeProxy = &m_pLargeHandles[slot];
+		rayCallback.process(largeProxy);
+		++slot;
+	}
+}
+
+
+
+//
+// overrides for CPU version
+//
+
+
+
+/// Fills m_hAABB with two records (min, max) per proxy: first all small-proxy slots up to
+/// m_LastHandleIndex, then all large-proxy slots up to m_LastLargeHandleIndex.
+/// The 'uw' field of the min record holds the proxy index (large proxies are offset by
+/// m_maxHandles); the 'uw' field of the max record holds the running count (same offset for
+/// large proxies).
+void btGpu3DGridBroadphase::prepareAABB()
+{
+	BT_PROFILE("prepareAABB");
+	bt3DGrid3F1U* out = m_hAABB;
+
+	// small proxies
+	int lastSeen = -1;
+	unsigned int num_small = 0;
+	for(int s = 0; s <= m_LastHandleIndex; s++)
+	{
+		const btSimpleBroadphaseProxy& src = m_pHandles[s];
+		lastSeen = s;
+
+		bt3DGrid3F1U& lo = out[0];
+		lo.fx = src.m_aabbMin.getX();
+		lo.fy = src.m_aabbMin.getY();
+		lo.fz = src.m_aabbMin.getZ();
+		lo.uw = s;
+
+		bt3DGrid3F1U& hi = out[1];
+		hi.fx = src.m_aabbMax.getX();
+		hi.fy = src.m_aabbMax.getY();
+		hi.fz = src.m_aabbMax.getZ();
+		hi.uw = num_small;
+
+		out += 2;
+		num_small++;
+	}
+	m_LastHandleIndex = lastSeen;
+
+	// large proxies, stored directly behind the small ones
+	lastSeen = -1;
+	unsigned int num_large = 0;
+	for(int l = 0; l <= m_LastLargeHandleIndex; l++)
+	{
+		const btSimpleBroadphaseProxy& src = m_pLargeHandles[l];
+		lastSeen = l;
+
+		bt3DGrid3F1U& lo = out[0];
+		lo.fx = src.m_aabbMin.getX();
+		lo.fy = src.m_aabbMin.getY();
+		lo.fz = src.m_aabbMin.getZ();
+		lo.uw = l + m_maxHandles;
+
+		bt3DGrid3F1U& hi = out[1];
+		hi.fx = src.m_aabbMax.getX();
+		hi.fy = src.m_aabbMax.getY();
+		hi.fz = src.m_aabbMax.getZ();
+		hi.uw = num_large + m_maxHandles;
+
+		out += 2;
+		num_large++;
+	}
+	m_LastLargeHandleIndex = lastSeen;
+
+	// paranoid checks
+	btAssert(num_small == m_numHandles);
+	btAssert(num_large == m_numLargeHandles);
+}
+
+
+
+/// CPU version: copies *hostParams into the file-scope s3DGridBroadphaseParams.
+void btGpu3DGridBroadphase::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	const bt3DGridBroadphaseParams& incoming = *hostParams;
+	s3DGridBroadphaseParams = incoming;
+}
+
+
+
+/// CPU version: runs btGpu_calcHashAABB over the m_numHandles small proxies, reading m_hAABB
+/// and passing m_hBodiesHash as the second buffer.
+void btGpu3DGridBroadphase::calcHashAABB()
+{
+	BT_PROFILE("bt3DGrid_calcHashAABB");
+	btGpu_calcHashAABB(m_hAABB,
+					   m_hBodiesHash,
+					   m_numHandles);
+}
+
+
+
+/// CPU version: sorts the first m_numHandles (hash, index) entries of m_hBodiesHash with a
+/// recursive quicksort. The comparisons place larger hash values first (descending order).
+/// m_hBodiesHash is reinterpreted as an array of two-word records.
+/// Must not be called with m_numHandles == 0 (calculateOverlappingPairs returns earlier).
+void btGpu3DGridBroadphase::sortHash()
+{
+	// local record type matching the layout of two consecutive unsigned ints
+	class bt3DGridHashKey
+	{
+	public:
+	   unsigned int hash;
+	   unsigned int index;
+	   // sorts pData[lo..hi] (inclusive bounds) in place; does not use the members of *this
+	   void quickSort(bt3DGridHashKey* pData, int lo, int hi)
+	   {
+			int i=lo, j=hi;
+			// pivot: copy of the middle element
+			bt3DGridHashKey x = pData[(lo+hi)/2];
+			do
+			{    
+				// skip entries that are already on the correct side of the pivot
+				while(pData[i].hash > x.hash) i++; 
+				while(x.hash > pData[j].hash) j--;
+				if(i <= j)
+				{
+					bt3DGridHashKey t = pData[i];
+					pData[i] = pData[j];
+					pData[j] = t;
+					i++; j--;
+				}
+			} while(i <= j);
+			// recurse into both partitions
+			if(lo < j) pData->quickSort(pData, lo, j);
+			if(i < hi) pData->quickSort(pData, i, hi);
+	   }
+	};
+	BT_PROFILE("bt3DGrid_sortHash");
+	bt3DGridHashKey* pHash = (bt3DGridHashKey*)m_hBodiesHash;
+	pHash->quickSort(pHash, 0, m_numHandles - 1);
+	return;
+}
+
+
+
+/// CPU version: runs btGpu_findCellStart on m_hBodiesHash and m_hCellStart for m_numHandles
+/// bodies and m_params.m_numCells cells.
+void btGpu3DGridBroadphase::findCellStart()
+{
+	BT_PROFILE("bt3DGrid_findCellStart");
+	btGpu_findCellStart(m_hBodiesHash,
+						m_hCellStart,
+						m_numHandles,
+						m_params.m_numCells);
+}
+
+
+
+/// CPU version of the small/small overlap search: forwards the host buffers and
+/// m_numHandles to btGpu_findOverlappingPairs.
+void btGpu3DGridBroadphase::findOverlappingPairs()
+{
+	BT_PROFILE("bt3DGrid_findOverlappingPairs");
+	btGpu_findOverlappingPairs(m_hAABB,
+							   m_hBodiesHash,
+							   m_hCellStart,
+							   m_hPairBuff,
+							   m_hPairBuffStartCurr,
+							   m_numHandles);
+}
+
+
+
+/// CPU version of the small/large overlap search: forwards the host buffers together with
+/// the small and large proxy counts to btGpu_findPairsLarge.
+void btGpu3DGridBroadphase::findPairsLarge()
+{
+	BT_PROFILE("bt3DGrid_findPairsLarge");
+	btGpu_findPairsLarge(m_hAABB,
+						 m_hBodiesHash,
+						 m_hCellStart,
+						 m_hPairBuff,
+						 m_hPairBuffStartCurr,
+						 m_numHandles,
+						 m_numLargeHandles);
+}
+
+
+
+/// CPU version: forwards the pair buffer, m_hPairScanChanged, the AABBs and m_numHandles to
+/// btGpu_computePairCacheChanges.
+void btGpu3DGridBroadphase::computePairCacheChanges()
+{
+	BT_PROFILE("bt3DGrid_computePairCacheChanges");
+	btGpu_computePairCacheChanges(m_hPairBuff,
+								  m_hPairBuffStartCurr,
+								  m_hPairScanChanged,
+								  m_hAABB,
+								  m_numHandles);
+}
+
+
+/// CPU version: converts m_hPairScanChanged in place into an exclusive prefix sum over its
+/// first m_numHandles + 2 entries (entry 0 is forced to zero beforehand).
+/// copyToCpu is not used by this implementation.
+void btGpu3DGridBroadphase::scanOverlappingPairBuff(bool copyToCpu)
+{
+	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
+	(void)copyToCpu;
+
+	m_hPairScanChanged[0] = 0;
+
+	unsigned int runningTotal = 0;
+	const int lastSlot = m_numHandles + 1;
+	for(int slot = 0; slot <= lastSlot; ++slot)
+	{
+		const unsigned int count = m_hPairScanChanged[slot];
+		m_hPairScanChanged[slot] = runningTotal;
+		runningTotal += count;
+	}
+}
+
+
+
+/// CPU version: calls btGpu_squeezeOverlappingPairBuff with m_hAllOverlappingPairs (viewed as
+/// a flat unsigned int array) as the output argument.
+/// NOTE(review): addPairsToCache() reads m_hPairsChanged, which this function no longer
+/// passes as the output - confirm that this is intended for the CPU path.
+void btGpu3DGridBroadphase::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
+	unsigned int* squeezedOut = (unsigned int*)m_hAllOverlappingPairs;
+	btGpu_squeezeOverlappingPairBuff(m_hPairBuff,
+									 m_hPairBuffStartCurr,
+									 m_hPairScanChanged,
+									 squeezedOut,
+									 m_hAABB,
+									 m_numHandles);
+}
+
+
+
+#include "btGpu3DGridBroadphaseSharedCode.h"
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.h
@@ -0,0 +1,154 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASE_H
+#define BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+// Pair of two integers; element type of btGpu3DGridBroadphase::m_hAllOverlappingPairs.
+// NOTE(review): despite the 'Uint' in the name both members are signed int.
+struct MyUint2
+{
+	int x;
+	int y;
+};
+
+//----------------------------------------------------------------------------------------
+
+///The btGpu3DGridBroadphase uses GPU-style code compiled for CPU to compute overlapping pairs
+
+// Grid broadphase built on btSimpleBroadphase. Small proxies are managed by the base class;
+// proxies classified as 'large' (see isLargeProxy) live in a separate pool owned by this class.
+// The pipeline stages at the bottom are virtual so that derived classes can replace them.
+class btGpu3DGridBroadphase : public btSimpleBroadphase
+{
+protected:
+	bool			m_bInitialized;		// set by _initialize(), cleared by _finalize()
+    unsigned int	m_numBodies;
+    unsigned int	m_numCells;
+	unsigned int	m_maxPairsPerBody;
+    unsigned int	m_maxBodiesPerCell;
+	bt3DGridBroadphaseParams m_params;
+	btScalar		m_maxRadius;		// proxies with a larger bounding radius are 'large'
+	// CPU data
+    unsigned int*	m_hBodiesHash;
+    unsigned int*	m_hCellStart;
+	unsigned int*	m_hPairBuffStartCurr;
+	bt3DGrid3F1U*	m_hAABB;
+	unsigned int*	m_hPairBuff;
+	unsigned int*	m_hPairScanChanged;
+	unsigned int*	m_hPairsChanged;
+	MyUint2*		m_hAllOverlappingPairs;
+// large proxies
+	int		m_numLargeHandles;			// slots currently in use
+	int		m_maxLargeHandles;			// capacity of the pool
+	int		m_LastLargeHandleIndex;		// highest slot index handed out so far, -1 if none
+	btSimpleBroadphaseProxy* m_pLargeHandles;
+	void* m_pLargeHandlesRawPtr;		// raw storage behind m_pLargeHandles
+	int		m_firstFreeLargeHandle;		// head of the free list
+
+	// Takes the first slot from the free list and returns its index.
+	// The caller must make sure that the pool is not exhausted.
+	int allocLargeHandle()
+	{
+		btAssert(m_numLargeHandles < m_maxLargeHandles);
+		int freeLargeHandle = m_firstFreeLargeHandle;
+		m_firstFreeLargeHandle = m_pLargeHandles[freeLargeHandle].GetNextFree();
+		m_numLargeHandles++;
+		if(freeLargeHandle > m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex = freeLargeHandle;
+		}
+		return freeLargeHandle;
+	}
+
+	// Puts a large proxy back on the free list and clears its client object.
+	// The slot index is derived from the proxy address, so the proxy must belong to
+	// m_pLargeHandles.
+	void freeLargeHandle(btSimpleBroadphaseProxy* proxy)
+	{
+		int handle = int(proxy - m_pLargeHandles);
+		// the index addresses the large pool, so it is bounded by m_maxLargeHandles
+		// (the assertion previously compared against m_maxHandles)
+		btAssert((handle >= 0) && (handle < m_maxLargeHandles));
+		if(handle == m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex--;
+		}
+		proxy->SetNextFree(m_firstFreeLargeHandle);
+		m_firstFreeLargeHandle = handle;
+		proxy->m_clientObject = 0;
+		m_numLargeHandles--;
+	}
+	bool isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax);
+	bool isLargeProxy(btBroadphaseProxy* proxy);
+// debug
+	unsigned int	m_numPairsAdded;
+	unsigned int	m_numPairsRemoved;
+	unsigned int	m_numOverflows;
+// 
+public:
+	// Returns the entry of m_hPairScanChanged at index m_numHandles+1
+	// (the last entry written by scanOverlappingPairBuff()).
+	virtual int getNumOverlap()
+	{
+		return m_hPairScanChanged[m_numHandles+1];
+	}
+	// Returns the host array m_hAllOverlappingPairs; ownership stays with this object.
+	virtual MyUint2* getOverlap()
+	{
+		return m_hAllOverlappingPairs;
+	}
+	// NOTE : for better results gridSizeX, gridSizeY and gridSizeZ should be powers of 2 
+	// (other values are rounded with getFloorPowOfTwo)
+	btGpu3DGridBroadphase(const btVector3& cellSize, 
+					   int gridSizeX, int gridSizeY, int gridSizeZ, 
+					   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+					   int maxBodiesPerCell = 8);
+	btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
+						const btVector3& cellSize, 
+						int gridSizeX, int gridSizeY, int gridSizeZ, 
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+						int maxBodiesPerCell = 8);
+	virtual ~btGpu3DGridBroadphase();
+	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
+
+	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
+	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
+	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
+	virtual void	resetPool(btDispatcher* dispatcher);
+
+	static int		getFloorPowOfTwo(int val); // returns 2^n : 2^(n+1) > val >= 2^n
+
+protected:
+	// one-time setup / teardown of the host buffers and the large-proxy pool
+	void _initialize(	const btVector3& cellSize, 
+						int gridSizeX, int gridSizeY, int gridSizeZ, 
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+						int maxBodiesPerCell);
+	void _finalize();
+	void addPairsToCache(btDispatcher* dispatcher);
+	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
+
+// overrides for CPU version
+	// (called in this order by calculateOverlappingPairs)
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
+	virtual void squeezeOverlappingPairBuff();
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif //BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedCode.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedCode.h
@@ -0,0 +1,428 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// Convert a position into integer cell coordinates of the uniform grid.
+// Each component is scaled by the inverse cell size, floored, and masked with
+// (gridSize - 1). The mask acts as a wrap-around only if the grid dimensions
+// are powers of two (assumed here -- confirm against the host-side setup).
+BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
+{
+	const int cellX = (int)floor(p.x * BT_GPU_params.m_invCellSizeX);
+	const int cellY = (int)floor(p.y * BT_GPU_params.m_invCellSizeY);
+	const int cellZ = (int)floor(p.z * BT_GPU_params.m_invCellSizeZ);
+	int3 cell;
+	cell.x = cellX & (BT_GPU_params.m_gridSizeX - 1);
+	cell.y = cellY & (BT_GPU_params.m_gridSizeY - 1);
+	cell.z = cellZ & (BT_GPU_params.m_gridSizeZ - 1);
+	return cell;
+} // bt3DGrid_calcGridPos()
+
+//----------------------------------------------------------------------------------------
+
+// Calculate the linear cell address for a grid position.
+// Coordinates are wrapped (masked with gridSize - 1), not clamped, so a
+// neighbour offset of -1 from cell 0 lands in the last cell of that axis.
+// The address is laid out as (z * gridSizeY + y) * gridSizeX + x.
+BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
+{
+	const int wrappedX = gridPos.x & (BT_GPU_params.m_gridSizeX - 1);
+	const int wrappedY = gridPos.y & (BT_GPU_params.m_gridSizeY - 1);
+	const int wrappedZ = gridPos.z & (BT_GPU_params.m_gridSizeZ - 1);
+	const uint sliceOffset = BT_GPU___mul24(BT_GPU___mul24(wrappedZ, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX);
+	const uint rowOffset = BT_GPU___mul24(wrappedY, BT_GPU_params.m_gridSizeX);
+	return sliceOffset + rowOffset + wrappedX;
+} // bt3DGrid_calcGridHash()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: calculate the grid hash for each body from the centre of its AABB.
+//   pAABB     - AABBs stored as consecutive (min, max) pairs: body i uses
+//               entries 2*i and 2*i + 1
+//   pHash     - output, one (cell hash, body index) pair per body
+//   numBodies - number of bodies to process; extra threads exit immediately
+BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	bt3DGrid3F1U bbMin = pAABB[index*2];
+	bt3DGrid3F1U bbMax = pAABB[index*2 + 1];
+	// centre of the AABB decides which cell the body belongs to
+	float4 pos;
+	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
+	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
+	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
+	// w is not used by the grid lookup, but pos is passed by value, so give
+	// it a defined value instead of copying an indeterminate float
+	pos.w = 0.0f;
+    // get address in grid
+    int3 gridPos = bt3DGrid_calcGridPos(pos);
+    uint gridHash = bt3DGrid_calcGridHash(gridPos);
+    // store grid hash and body index
+    pHash[index] = BT_GPU_make_uint2(gridHash, index);
+} // calcHashAABBD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: for every cell hash that occurs in pHash, store in cellStart[hash]
+// the index of the first entry carrying that hash. An entry is a "first" one
+// when it is entry 0 or its hash differs from the hash of the entry before it,
+// so this presumably expects pHash to be sorted by hash (see sortHash()).
+//   pHash     - (cell hash, body index) pairs
+//   cellStart - output table indexed by cell hash; entries for hashes that do
+//               not occur are left untouched
+//   numBodies - number of entries in pHash
+// NOTE(review): threads with index >= numBodies return before the barrier
+// below; confirm that this is acceptable on the real GPU back end. In the CPU
+// emulation the barrier macro is empty.
+BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	// Load hash data into shared memory so that we can look 
+	// at neighboring body's hash value without loading
+	// two hash values per thread
+	// 257 slots: one per thread for a block of up to 256 threads, plus slot 0
+	// for the hash of the entry preceding the block
+	BT_GPU___shared__ uint sharedHash[257];
+	sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
+	if((index > 0) && (BT_GPU_threadIdx.x == 0))
+	{
+		// first thread in block must load neighbor body hash
+		volatile uint2 prevData = pHash[index-1];
+		sharedHash[0] = prevData.x;
+	}
+	BT_GPU___syncthreads();
+	// sharedHash[threadIdx.x] now holds the hash of entry (index - 1)
+	if((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x]))
+	{
+		cellStart[sortedData.x] = index;
+	}
+} // findCellStartD()
+
+//----------------------------------------------------------------------------------------
+
+// Test two AABBs for overlap. Returns non-zero when the [min, max] intervals
+// intersect on all three axes (touching boxes count as overlapping).
+// Only the fx/fy/fz members are examined.
+BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
+{
+	const uint overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
+	const uint overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
+	const uint overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
+	return overlapX && overlapY && overlapZ;
+} // cudaTestAABBOverlap()
+ 
+//----------------------------------------------------------------------------------------
+
+// Test one body against all bodies hashed into one grid cell and record the
+// overlaps in that body's pair list.
+//   gridPos            - cell to examine (wrapped by bt3DGrid_calcGridHash)
+//   index              - position of the body in the pHash array
+//   pHash              - (cell hash, body index) pairs
+//   pCellStart         - first pHash index per cell, 0xffffffff if cell is empty
+//   pAABB              - AABBs as (min, max) pairs; min.uw is the handle index
+//   pPairBuff          - pair storage for all bodies
+//   pPairBuffStartCurr - per handle: x = start of its list in pPairBuff,
+//                        y = number of entries currently in the list
+//   numBodies          - number of entries in pHash
+// A pair is stored only on the body with the larger unsorted index, which
+// also excludes the body itself. A pair already in the list gets
+// BT_3DGRID_PAIR_FOUND_FLG; otherwise it is appended with
+// BT_3DGRID_PAIR_NEW_FLG, unless the list is full.
+BT_GPU___device__ void findPairsInCell(	int3	gridPos,
+										uint    index,
+										uint2*  pHash,
+										uint*   pCellStart,
+										bt3DGrid3F1U* pAABB, 
+										uint*   pPairBuff,
+										uint2*	pPairBuffStartCurr,
+										uint	numBodies)
+{
+    uint gridHash = bt3DGrid_calcGridHash(gridPos);
+    // get start of bucket for this cell
+    uint bucketStart = pCellStart[gridHash];
+    if (bucketStart == 0xffffffff)
+	{
+        return;   // cell empty
+	}
+	// iterate over bodies in this cell
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+    bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2); 
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	// capacity of this list: distance to the next handle's start, minus one
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+	// scan at most m_maxBodiesPerCell entries and never past the end of pHash
+	uint bucketEnd = bucketStart + BT_GPU_params.m_maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numBodies) ? numBodies : bucketEnd;
+	for(uint index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        uint2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		uint unsorted_indx2 = cellData.y;
+        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
+        {   
+			bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2);
+			bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, unsorted_indx2*2 + 1);
+			if(cudaTestAABBOverlap(min0, max0, min1, max1))
+			{
+				uint handleIndex2 = min1.uw;
+				uint k;
+				// look for the pair among the entries already in the list
+				for(k = 0; k < curr; k++)
+				{
+					uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
+					if(old_pair == handleIndex2)
+					{
+						pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+						break;
+					}
+				}
+				if(k == curr)
+				{
+					// not in the list yet: append it, if there is room
+					if(curr >= curr_max) 
+					{ // not a good solution, but let's avoid crash
+						break;
+					}
+					pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
+					curr++;
+				}
+			}
+		}
+	}
+	// write back the (possibly increased) entry count
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
+    return;
+} // findPairsInCell()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: for each body, test it against the bodies in its own grid cell and
+// in the 26 neighbouring cells, updating the body's pair list through
+// findPairsInCell().
+//   pAABB              - AABBs as (min, max) pairs, indexed by body index
+//   pHash              - (cell hash, body index) pairs
+//   pCellStart         - first pHash index per cell
+//   pPairBuff          - pair storage for all bodies
+//   pPairBuffStartCurr - per-handle (start, count) of its list in pPairBuff
+//   numBodies          - number of entries in pHash
+BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U*	pAABB, uint2* pHash, uint* pCellStart, 
+												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	bt3DGrid3F1U bbMin = BT_GPU_FETCH(pAABB, unsorted_indx*2);
+	bt3DGrid3F1U bbMax = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	// the body's home cell is the one containing the centre of its AABB
+	float4 pos;
+	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
+	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
+	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
+	// w is not used by the grid lookup, but pos is passed by value, so give
+	// it a defined value instead of copying an indeterminate float
+	pos.w = 0.0f;
+    // get address in grid
+    int3 gridPos = bt3DGrid_calcGridPos(pos);
+    // examine only neighbouring cells
+    for(int z=-1; z<=1; z++) {
+        for(int y=-1; y<=1; y++) {
+            for(int x=-1; x<=1; x++) {
+                findPairsInCell(gridPos + BT_GPU_make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
+            }
+        }
+    }
+} // findOverlappingPairsD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: test each small body against every large body and record the
+// overlaps in the small body's pair list.
+//   pAABB              - AABBs as (min, max) pairs; the large bodies occupy
+//                        body indices numBodies .. numBodies + numLarge - 1
+//   pHash              - (cell hash, body index) pairs of the small bodies
+//   pCellStart         - not used by this kernel
+//   pPairBuff          - pair storage for all bodies
+//   pPairBuffStartCurr - per-handle (start, count) of its list in pPairBuff
+//   numBodies          - number of small bodies
+//   numLarge           - number of large bodies
+// A pair already in the list gets BT_3DGRID_PAIR_FOUND_FLG; otherwise it is
+// appended with BT_3DGRID_PAIR_NEW_FLG, unless the list is full.
+BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
+										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	// capacity of this list: distance to the next handle's start, minus one
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		uint indx2 = numBodies + i;
+		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
+		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
+		if(cudaTestAABBOverlap(min0, max0, min1, max1))
+		{
+			uint k;
+			uint handleIndex2 =  min1.uw;
+			// look for the pair among the entries already in the list
+			for(k = 0; k < curr; k++)
+			{
+				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+					break;
+				}
+			}
+			if(k == curr)
+			{
+				// Check the capacity BEFORE writing, as findPairsInCell() does,
+				// so that a full list is never written past its counted end.
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
+				curr++;
+			}
+		}
+    }
+	// write back the (possibly increased) entry count
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
+    return;
+} // findPairsLargeD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: count, per body, the pair-list entries that carry at least one of
+// the BT_3DGRID_PAIR_ANY_FLG bits and store the count in pPairScan[index + 1].
+//   pPairBuff          - pair storage for all bodies
+//   pPairBuffStartCurr - per-handle (start, count) of its list in pPairBuff
+//   pPairScan          - output counts, shifted by one element
+//   pAABB              - AABBs as (min, max) pairs; min.uw is the handle index
+//   numBodies          - number of bodies to process
+BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, 
+												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+	const int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index < (int)numBodies)
+	{
+		const uint handleIndex = pAABB[index * 2].uw;
+		const uint2 listRange = pPairBuffStartCurr[handleIndex];
+		const uint* entries = pPairBuff + listRange.x;
+		uint flaggedCount = 0;
+		for(uint k = 0; k < listRange.y; k++)
+		{
+			if(entries[k] & BT_3DGRID_PAIR_ANY_FLG)
+			{
+				flaggedCount++;
+			}
+		}
+		pPairScan[index+1] = flaggedCount;
+	}
+} // computePairCacheChangesD()
+
+//----------------------------------------------------------------------------------------
+
+// Kernel: for each body, (1) copy every flagged pair-list entry to pPairOut
+// as a (handle, other handle) pair and (2) compact the body's list in place
+// so that only the flagged entries remain, with their flag bits cleared.
+//   pPairBuff          - pair storage for all bodies (compacted in place)
+//   pPairBuffStartCurr - per-handle (start, count); count is rewritten
+//   pPairScan          - pPairScan[index + 1] is used as the body's output
+//                        offset into pPairOut (presumably the counts written
+//                        by computePairCacheChangesD after a prefix scan --
+//                        confirm against scanOverlappingPairBuff())
+//   pPairOut           - output array of (handle, other handle) pairs
+//   pAABB              - AABBs as (min, max) pairs; min.uw is the handle index
+//   numBodies          - number of bodies to process
+BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
+												   uint2* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	bt3DGrid3F1U bbMin = pAABB[index * 2];
+	uint handleIndex = bbMin.uw;
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	uint* pInp = pPairBuff + start;
+	uint2* pOut = pPairOut + pPairScan[index+1];
+	// pOut2 is the write cursor of the in-place compaction; it never runs
+	// ahead of the read cursor pInp
+	uint* pOut2 = pInp;
+	uint num = 0; 
+	for(uint k = 0; k < curr; k++, pInp++)
+	{
+		// both tests below use the same condition: an entry with either flag
+		// set is both emitted to pPairOut and kept in the list
+		if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
+		//if(!((*pInp) & BT_3DGRID_PAIR_FOUND_FLG))
+		{
+			pOut->x = handleIndex;
+			pOut->y = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
+
+			pOut++;
+		}
+		if((*pInp) & BT_3DGRID_PAIR_ANY_FLG)
+		{
+			*pOut2 = (*pInp) & (~BT_3DGRID_PAIR_ANY_FLG);
+			pOut2++;
+			num++;
+		}
+	}
+	// entries without any flag have been dropped; store the new count
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, num);
+} // squeezeOverlappingPairBuffD()
+
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               E N D   O F    K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Host wrapper: run calcHashAABBD over numBodies bodies, using blocks of up
+// to 256 threads. hash receives one (cell hash, body index) pair per body.
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
+{
+	int blockCount, threadsPerBlock;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	// launch the kernel, then check whether the invocation reported an error
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
+	BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
+} // calcHashAABB()
+
+//----------------------------------------------------------------------------------------
+
+// Host wrapper: fill cellStart with 0xff bytes (so every entry reads
+// 0xffffffff, the "empty cell" marker) and then run findCellStartD over
+// numBodies hash entries, using blocks of up to 256 threads.
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
+{
+	int blockCount, threadsPerBlock;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
+} // findCellStart()
+
+//----------------------------------------------------------------------------------------
+
+// Host wrapper: run findOverlappingPairsD over numBodies bodies, using blocks
+// of up to 64 threads. When B_CUDA_USE_TEX is set, the AABB array is bound to
+// the pAABBTex texture for the duration of the kernel.
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies)
+{
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+	int blockCount, threadsPerBlock;
+	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findOverlappingPairs()
+
+//----------------------------------------------------------------------------------------
+
+// Host wrapper: run findPairsLargeD over numBodies small bodies, using blocks
+// of up to 64 threads. When B_CUDA_USE_TEX is set, the AABBs of the small and
+// the large bodies are bound to the pAABBTex texture for the kernel.
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge)
+{
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+	int blockCount, threadsPerBlock;
+	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findPairsLarge()
+
+//----------------------------------------------------------------------------------------
+
+// Host wrapper: run computePairCacheChangesD over numBodies bodies, using
+// blocks of up to 256 threads.
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies)
+{
+	int blockCount, threadsPerBlock;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
+} // computePairCacheChanges()
+
+//----------------------------------------------------------------------------------------
+
+// Host wrapper: run squeezeOverlappingPairBuffD over numBodies bodies, using
+// blocks of up to 256 threads. pPairOut is handed to the kernel as an array
+// of uint2 pairs.
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies)
+{
+	int blockCount, threadsPerBlock;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint2*)pPairOut,pAABB,numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
+} // squeezeOverlappingPairBuff()
+
+//------------------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedDefs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedDefs.h
@@ -0,0 +1,61 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+#define BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Host entry points of the 3D grid broadphase. BT_GPU_PREF() adds a
+// back-end specific prefix to each name. The unsigned int* buffers that hold
+// pairs (hash, pPairBuffStartCurr, pPairOut) are reinterpreted as uint2
+// arrays by the implementations.
+
+// compute one (cell hash, body index) pair per body from its AABB centre
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies);
+
+// reset cellStart and record the first hash entry of every occupied cell
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
+
+// test every body against the bodies in its own and the neighbouring cells
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies);
+
+// test every small body against the numLarge large bodies
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge);
+
+// count the flagged pair-list entries of every body into pPairScan
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+// emit the flagged pairs to pPairOut and compact the pair lists in place
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedTypes.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedTypes.h
@@ -0,0 +1,64 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+#define BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
+//----------------------------------------------------------------------------------------
+
+// Flag bits stored in the upper bits of a pair-list entry, next to the handle
+// index of the other body:
+//   FOUND - set on an entry that was already in the list and overlaps again
+//   NEW   - set on an entry that has just been appended
+//   ANY   - mask of both flags; cleared to recover the plain handle index
+#define BT_3DGRID_PAIR_FOUND_FLG (0x40000000)
+#define BT_3DGRID_PAIR_NEW_FLG   (0x20000000)
+#define BT_3DGRID_PAIR_ANY_FLG   (BT_3DGRID_PAIR_FOUND_FLG | BT_3DGRID_PAIR_NEW_FLG)
+
+//----------------------------------------------------------------------------------------
+
+// Grid parameters shared between host code and kernels.
+struct bt3DGridBroadphaseParams 
+{
+	// number of cells per axis; the kernels use (size - 1) as a bit mask, so
+	// these are presumably required to be powers of two -- TODO confirm
+	unsigned int	m_gridSizeX;
+	unsigned int	m_gridSizeY;
+	unsigned int	m_gridSizeZ;
+	// presumably the total number of cells -- verify against the host setup
+	unsigned int	m_numCells;
+	// reciprocal of the cell size per axis; positions are multiplied by it
+	float			m_invCellSizeX;
+	float			m_invCellSizeY;
+	float			m_invCellSizeZ;
+	unsigned int	m_numBodies;
+	// upper bound on the number of hash entries scanned per cell
+	unsigned int	m_maxBodiesPerCell;
+};
+
+//----------------------------------------------------------------------------------------
+
+// Three floats plus one unsigned int. The kernels store each AABB as two
+// consecutive elements (min corner, then max corner); the uw member of the
+// min element is read as the body's handle index.
+struct bt3DGrid3F1U
+{
+	float			fx;
+	float			fy;
+	float			fz;
+	unsigned int	uw;
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuDefines.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuDefines.h
@@ -0,0 +1,211 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+// definitions for "GPU on CPU" code
+
+
+#ifndef BT_GPU_DEFINES_H
+#define BT_GPU_DEFINES_H
+
+// Short-hand for unsigned int, as used by the kernel code.
+typedef unsigned int uint;
+
+// Plain vector structs for the "GPU on CPU" build, providing the x/y/z/w
+// members that the shared kernel code accesses.
+
+// two signed integers
+struct int2
+{
+	int x, y;
+};
+
+// two unsigned integers
+struct uint2
+{
+	unsigned int x, y;
+};
+
+// three signed integers
+struct int3
+{
+	int x, y, z;
+};
+
+// three unsigned integers
+struct uint3
+{
+	unsigned int x, y, z;
+};
+
+// four floats
+struct float4
+{
+	float x, y, z, w;
+};
+
+// three floats
+struct float3
+{
+	float x, y, z;
+};
+
+
+// CPU replacements for the qualifiers and intrinsics used by the shared
+// kernel code.
+// device and kernel functions become ordinary inline functions
+#define BT_GPU___device__ inline
+// data qualifiers expand to nothing
+#define BT_GPU___devdata__
+#define BT_GPU___constant__
+// NOTE: function-like macros, the arguments may be evaluated twice
+#define BT_GPU_max(a, b) ((a) > (b) ? (a) : (b))
+#define BT_GPU_min(a, b) ((a) < (b) ? (a) : (b))
+// name of the parameter block that the kernels read
+#define BT_GPU_params s3DGridBroadphaseParams
+// plain multiplication on the CPU
+#define BT_GPU___mul24(a, b) ((a)*(b))
+#define BT_GPU___global__ inline
+// "shared" memory becomes a function-local static
+#define BT_GPU___shared__ static
+// no barrier is needed: BT_GPU_EXECKERNEL runs the threads one after another
+#define BT_GPU___syncthreads()
+#define CUDART_PI_F SIMD_PI
+
+// Constructors for the vector structs, each with a BT_GPU_make_xxx alias
+// that the shared kernel code uses.
+
+// uint2 from two components
+static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
+{
+	uint2 result = { x, y };
+	return result;
+}
+#define BT_GPU_make_uint2(x, y) bt3dGrid_make_uint2(x, y)
+
+// int3 from three components
+static inline int3 bt3dGrid_make_int3(int x, int y, int z)
+{
+	int3 result = { x, y, z };
+	return result;
+}
+#define BT_GPU_make_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
+
+// float3 from three components
+static inline float3 bt3dGrid_make_float3(float x, float y, float z)
+{
+	float3 result = { x, y, z };
+	return result;
+}
+#define BT_GPU_make_float3(x, y, z) bt3dGrid_make_float3(x, y, z)
+
+// float3 from the x, y, z components of a float4 (w is dropped)
+static inline float3 bt3dGrid_make_float34(float4 f)
+{
+	float3 result = { f.x, f.y, f.z };
+	return result;
+}
+#define BT_GPU_make_float34(f) bt3dGrid_make_float34(f)
+
+// float3 with all three components set to the same value
+static inline float3 bt3dGrid_make_float31(float f)
+{
+	float3 result = { f, f, f };
+	return result;
+}
+#define BT_GPU_make_float31(x) bt3dGrid_make_float31(x)
+
+// float4 from a float3 plus a separate w component
+static inline float4 bt3dGrid_make_float42(float3 v, float f)
+{
+	float4 result = { v.x, v.y, v.z, f };
+	return result;
+}
+#define BT_GPU_make_float42(a, b) bt3dGrid_make_float42(a, b) 
+
+// float4 from four components
+static inline float4 bt3dGrid_make_float44(float a, float b, float c, float d)
+{
+	float4 result = { a, b, c, d };
+	return result;
+}
+#define BT_GPU_make_float44(a, b, c, d) bt3dGrid_make_float44(a, b, c, d) 
+
+// component-wise sum of two int3 values
+inline int3 operator+(int3 a, int3 b)
+{
+    return bt3dGrid_make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+
+// component-wise sum of two float4 values (w included)
+inline float4 operator+(const float4& a, const float4& b)
+{
+	float4 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; r.w = a.w+b.w; return r;
+}
+// float4 scaled by a scalar (w included)
+inline float4 operator*(const float4& a, float fact)
+{
+	float4 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; r.w = a.w*fact; return r;
+}
+// scalar * float4; the vector is taken by const reference so that const
+// operands and temporaries (e.g. the result of a + b) are accepted as well
+inline float4 operator*(float fact, const float4& a)
+{
+	return (a * fact);
+}
+// scale a float4 in place
+inline float4& operator*=(float4& a, float fact)
+{
+	a = fact * a;
+	return a;
+}
+// add a float4 in place
+inline float4& operator+=(float4& a, const float4& b)
+{
+	a = a + b;
+	return a;
+}
+
+// component-wise sum of two float3 values
+inline float3 operator+(const float3& a, const float3& b)
+{
+	float3 r; r.x = a.x+b.x; r.y = a.y+b.y; r.z = a.z+b.z; return r;
+}
+// component-wise difference of two float3 values
+inline float3 operator-(const float3& a, const float3& b)
+{
+	float3 r; r.x = a.x-b.x; r.y = a.y-b.y; r.z = a.z-b.z; return r;
+}
+// dot product of two float3 values; operands are taken by const reference
+// so that const objects and temporaries are accepted
+static inline float bt3dGrid_dot(const float3& a, const float3& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+#define BT_GPU_dot(a,b) bt3dGrid_dot(a,b)
+
+// four-component dot product of two float4 values; operands are taken by
+// const reference so that const objects and temporaries are accepted
+static inline float bt3dGrid_dot4(const float4& a, const float4& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
+}
+#define BT_GPU_dot4(a,b) bt3dGrid_dot4(a,b)
+
+// cross product a x b
+static inline float3 bt3dGrid_cross(const float3& a, const float3& b)
+{
+	float3 r; r.x = a.y*b.z-a.z*b.y; r.y = -a.x*b.z+a.z*b.x; r.z = a.x*b.y-a.y*b.x;	return r;
+}
+#define BT_GPU_cross(a,b) bt3dGrid_cross(a,b)
+
+
+// float3 scaled by a scalar
+inline float3 operator*(const float3& a, float fact)
+{
+	float3 r; r.x = a.x*fact; r.y = a.y*fact; r.z = a.z*fact; return r;
+}
+
+
+// add a float3 in place
+inline float3& operator+=(float3& a, const float3& b)
+{
+	a = a + b;
+	return a;
+}
+// subtract a float3 in place
+inline float3& operator-=(float3& a, const float3& b)
+{
+	a = a - b;
+	return a;
+}
+// scale a float3 in place
+inline float3& operator*=(float3& a, float fact)
+{
+	a = a * fact;
+	return a;
+}
+// negate every component of a float3
+inline float3 operator-(const float3& v)
+{
+	float3 r; r.x = -v.x; r.y = -v.y; r.z = -v.z; return r;
+}
+
+
+// memory fetches are plain array accesses on the CPU
+#define BT_GPU_FETCH(a, b) a[b]
+#define BT_GPU_FETCH4(a, b) a[b]
+// prefix of the host entry points of this back end
+#define BT_GPU_PREF(func) btGpu_##func
+// no error wrapping: the call is executed as-is
+#define BT_GPU_SAFE_CALL(func) func
+// memset() fills bytes, so only the low byte of the fill value is used
+#define BT_GPU_Memset memset
+#define BT_GPU_MemcpyToSymbol(a, b, c) memcpy(&a, b, c)
+// texture binding expands to nothing
+#define BT_GPU_BindTexture(a, b, c, d)
+#define BT_GPU_UnbindTexture(a)
+
+// emulated block/thread indices; only the x members are set by
+// BT_GPU_EXECKERNEL. Being static, each translation unit including this
+// header gets its own copy.
+static uint2 s_blockIdx, s_blockDim, s_threadIdx;
+#define BT_GPU_blockIdx s_blockIdx
+#define BT_GPU_blockDim s_blockDim
+#define BT_GPU_threadIdx s_threadIdx
+// Emulated kernel launch: calls kfunc once for every (block, thread) pair,
+// one after another, setting the emulated indices before each call.
+// NOTE: numb and numt are evaluated repeatedly, and the loop variables are
+// named nb and nt, so arguments must not use those names.
+#define BT_GPU_EXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
+
+// error checking expands to nothing
+#define BT_GPU_CHECK_ERROR(s)
+
+
+#endif //BT_GPU_DEFINES_H
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedCode.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedCode.h
@@ -0,0 +1,55 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared code for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  will be compiled by both CPU and CUDA compilers
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpuUtilsSharedDefs.h"
+
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Divide a by b, adding one to the truncated quotient whenever the division
+// leaves a remainder. b must not be zero.
+int BT_GPU_PREF(iDivUp)(int a, int b)
+{
+	int quotient = a / b;
+	if((a % b) != 0)
+	{
+		quotient++;
+	}
+	return quotient;
+} // iDivUp()
+
+//----------------------------------------------------------------------------------------
+
+// Compute grid and thread block size for a given number of elements.
+//   n          - number of elements to process
+//   blockSize  - maximum number of threads per block
+//   numBlocks  - output: number of blocks needed to cover n elements
+//   numThreads - output: threads per block, min(blockSize, n)
+// If n or blockSize is not positive, both outputs are set to 0. Without this
+// guard n == 0 gave numThreads == 0 and iDivUp() divided by zero.
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
+{
+	if((n <= 0) || (blockSize <= 0))
+	{
+		numThreads = 0;
+		numBlocks = 0;
+		return;
+	}
+    numThreads = BT_GPU_min(blockSize, n);
+    numBlocks = BT_GPU_PREF(iDivUp)(n, numThreads);
+} // computeGridSize()
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedDefs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedDefs.h
@@ -0,0 +1,52 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+// Shared definitions for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+#ifndef BTGPUUTILSDHAREDDEFS_H
+#define BTGPUUTILSDHAREDDEFS_H
+
+
+// Prototypes of the shared GPU utility functions, declared with C linkage.
+// Every name is wrapped in BT_GPU_PREF, which must already be defined by the
+// including file (see the note at the top of this header).
+extern "C"
+{
+
+
+//Round a / b to nearest higher integer value
+int BT_GPU_PREF(iDivUp)(int a, int b);
+
+// compute grid and thread block size for a given number of elements
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads);
+
+// Array and GL buffer helpers; their definitions are not part of this header.
+// presumably "device" refers to GPU memory and "host" to CPU memory - TODO confirm against the implementation
+void BT_GPU_PREF(allocateArray)(void** devPtr, unsigned int size);
+void BT_GPU_PREF(freeArray)(void* devPtr);
+void BT_GPU_PREF(copyArrayFromDevice)(void* host, const void* device, unsigned int size);
+void BT_GPU_PREF(copyArrayToDevice)(void* device, const void* host, unsigned int size);
+// NOTE(review): the three declarations below pass the whole declarator (name plus
+// parameter list) to BT_GPU_PREF, unlike the ones above. This only yields the intended
+// prototype if the macro merely prefixes the first token of its argument - confirm.
+void BT_GPU_PREF(registerGLBufferObject(unsigned int vbo));
+void* BT_GPU_PREF(mapGLBufferObject(unsigned int vbo));
+void BT_GPU_PREF(unmapGLBufferObject(unsigned int vbo));
+
+
+} // extern "C"
+
+
+#endif // BTGPUUTILSDHAREDDEFS_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/premake4.lua
@@ -0,0 +1,5 @@
+
+	-- Pull in the premake script of the AMD sub-directory; the Intel and
+	-- NVIDIA variants are currently disabled (commented out).
+	include "AMD"
+--	include "Intel"
+--	include "NVIDIA"
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/AMD/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Console application built from the basic_initialize sources.
+	-- The project is only declared when findOpenCL_AMD() returns a truthy value.
+	hasCL = findOpenCL_AMD()
+
+	if hasCL then
+		project("OpenCL_intialize_AMD")
+
+		-- must run after project() so that it applies to this project
+		initOpenCL_AMD()
+
+		language("C++")
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		-- includedirs {"..","../../../../include/gpu_research"}
+
+		files({
+			"../main.cpp",
+			"../btOpenCLUtils.cpp",
+			"../btOpenCLUtils.h",
+		})
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/Intel/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Console application built from the basic_initialize sources.
+	-- The project is only declared when findOpenCL_Intel() returns a truthy value.
+	hasCL = findOpenCL_Intel()
+
+	if hasCL then
+		project("OpenCL_intialize_Intel")
+
+		-- must run after project() so that it applies to this project
+		initOpenCL_Intel()
+
+		language("C++")
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		-- includedirs {"..","../../../../include/gpu_research"}
+
+		files({
+			"../main.cpp",
+			"../btOpenCLUtils.cpp",
+			"../btOpenCLUtils.h",
+		})
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/NVIDIA/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Console application built from the basic_initialize sources.
+	-- The project is only declared when findOpenCL_NVIDIA() returns a truthy value.
+	hasCL = findOpenCL_NVIDIA()
+
+	if hasCL then
+		project("OpenCL_intialize_NVIDIA")
+
+		-- must run after project() so that it applies to this project
+		initOpenCL_NVIDIA()
+
+		language("C++")
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		-- includedirs {"..","../../../../include/gpu_research"}
+
+		files({
+			"../main.cpp",
+			"../btOpenCLUtils.cpp",
+			"../btOpenCLUtils.h",
+		})
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLInclude.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLInclude.h
@@ -0,0 +1,43 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_OPENCL_INCLUDE_H
+#define BT_OPENCL_INCLUDE_H
+
+
+// Select the OpenCL header:
+//  - USE_MINICL picks the MiniCL header on every platform,
+//  - otherwise Apple uses <OpenCL/cl.h> and all other platforms <CL/cl.h>,
+//  - the GL interop header CL/cl_gl.h is only pulled in for non-MiniCL Windows builds.
+#ifdef __APPLE__
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#else
+#include <OpenCL/cl.h>
+#endif
+#else
+#ifdef USE_MINICL
+#include <MiniCL/cl.h>
+#else
+#include <CL/cl.h>
+#ifdef _WIN32
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif
+#endif //__APPLE__
+
+#include <assert.h>
+#include <stdio.h>
+// Prints the value of a and then asserts when a differs from the expected value b.
+// NOTE(review): a is evaluated up to three times and the macro expands to a bare
+// 'if' statement (not wrapped in do { } while(0)), so pass side-effect free
+// arguments and avoid using it directly in front of an 'else'.
+#define oclCHECKERROR(a, b) if((a)!=(b)) { printf("OCL Error : %d\n", (a)); assert((a) == (b)); }
+
+
+#endif //BT_OPENCL_INCLUDE_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.cpp
@@ -0,0 +1,731 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+
+#include <string.h>
+
+#include "btOpenCLUtils.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+#define BT_MAX_CL_DEVICES 16 //who needs 16 devices?
+
+#ifdef _WIN32
+#include <Windows.h>
+#include <assert.h>
+
+#define btAssert assert
+#endif
+
+//Set the preferred platform vendor using the OpenCL SDK
+//The string is chosen at compile time from the CL_PLATFORM_* macro that is defined.
+//It points at a string literal, so it is declared const (binding a literal to a
+//plain char* is deprecated in C++03 and ill-formed since C++11).
+static const char* spPlatformVendor = 
+#if defined(CL_PLATFORM_MINI_CL)
+"MiniCL, SCEA";
+#elif defined(CL_PLATFORM_AMD)
+"Advanced Micro Devices, Inc.";
+#elif defined(CL_PLATFORM_NVIDIA)
+"NVIDIA Corporation";
+#elif defined(CL_PLATFORM_INTEL)
+"Intel(R) Corporation";
+#else
+"Unknown Vendor";
+#endif
+
+#ifndef CL_PLATFORM_MINI_CL
+#ifdef _WIN32
+#include "CL/cl_gl.h"
+#endif //_WIN32
+#endif
+
+// Returns the number of OpenCL platforms reported by clGetPlatformIDs.
+// When the query fails and pErrNum is non-NULL the error code is stored in *pErrNum;
+// *pErrNum is left untouched on success.
+int btOpenCLUtils::getNumPlatforms(cl_int* pErrNum)
+{
+	cl_uint platformCount = 0;
+	const cl_int status = clGetPlatformIDs(0, NULL, &platformCount);
+
+	const bool queryFailed = (status != CL_SUCCESS);
+	if (queryFailed && pErrNum != NULL)
+	{
+		*pErrNum = status;
+	}
+
+	return platformCount;
+}
+
+// Returns the vendor string selected at compile time (see spPlatformVendor).
+const char* btOpenCLUtils::getSdkVendorName()
+{
+	const char* vendorName = spPlatformVendor;
+	return vendorName;
+}
+
+// Returns the platform with the given index.
+//   platformIndex - zero based index into the list returned by clGetPlatformIDs
+//   pErrNum       - optional; receives the OpenCL error code when a query fails
+// Returns 0 when the index is out of range or when the platforms cannot be enumerated.
+cl_platform_id btOpenCLUtils::getPlatform(int platformIndex, cl_int* pErrNum)
+{
+	cl_platform_id platform = 0;
+
+	cl_uint numPlatforms = 0;
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		// numPlatforms is not trustworthy after a failed query
+		if(pErrNum != NULL) 
+			*pErrNum = ciErrNum;
+		return platform;
+	}
+	
+	if (platformIndex>=0 && (cl_uint)platformIndex<numPlatforms)
+	{
+		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum == CL_SUCCESS)
+		{
+			platform = platforms[platformIndex];
+		}
+		else if(pErrNum != NULL) 
+		{
+			*pErrNum = ciErrNum;
+		}
+
+		// released on both the success and the error path
+		delete[] platforms;
+	}
+
+	return platform;
+}
+
+// Fills platformInfo with the vendor, name and version strings of the platform.
+// Each query is checked with oclCHECKERROR.
+void btOpenCLUtils::getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo& platformInfo)
+{
+	struct StringQuery
+	{
+		cl_platform_info	m_param;
+		void*				m_destination;
+	};
+
+	// queried in the same order as listed here
+	const StringQuery queries[3] =
+	{
+		{ CL_PLATFORM_VENDOR,	platformInfo.m_platformVendor },
+		{ CL_PLATFORM_NAME,		platformInfo.m_platformName },
+		{ CL_PLATFORM_VERSION,	platformInfo.m_platformVersion }
+	};
+
+	for (int q=0;q<3;q++)
+	{
+		cl_int ciErrNum = clGetPlatformInfo(	platform,queries[q].m_param,BT_MAX_STRING_LENGTH,queries[q].m_destination,NULL);
+		oclCHECKERROR(ciErrNum,CL_SUCCESS);
+	}
+}
+
+// Creates an OpenCL context for devices of the given type on the given platform.
+//   platform               - platform to use; when NULL no context properties are passed
+//   deviceType             - device type filter handed to clGetDeviceIDs
+//   pErrNum                - optional; receives the last OpenCL error code
+//   pGLContext, pGLDC      - optional GL context / device context for CL-GL sharing
+//   preferredDeviceIndex   - when in range, a context is created for that device only,
+//                            otherwise a context containing all found devices is created
+//   preferredPlatformIndex - not used by this function
+// Returns the context, or 0 on failure.
+cl_context btOpenCLUtils::createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC, int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	cl_context retContext = 0;
+	cl_int ciErrNum=0;
+
+	/*     
+	* If we could find our platform, use it. Otherwise pass a NULL and get whatever the     
+	* implementation thinks we should be using.     
+	*/
+	cl_context_properties cps[7] = {0,0,0,0,0,0,0};
+	cps[0] = CL_CONTEXT_PLATFORM;
+	cps[1] = (cl_context_properties)platform;
+	if (pGLContext && pGLDC)
+	{
+		cps[2] = CL_GL_CONTEXT_KHR;
+		cps[3] = (cl_context_properties)pGLContext;
+		cps[4] = CL_WGL_HDC_KHR;
+		cps[5] = (cl_context_properties)pGLDC;
+	}
+
+	cl_uint num_entries = BT_MAX_CL_DEVICES;
+	cl_device_id devices[BT_MAX_CL_DEVICES];
+
+	cl_uint num_devices=0;
+
+	ciErrNum = clGetDeviceIDs(	
+		platform,
+		deviceType,
+		num_entries,
+		devices,
+		&num_devices);
+
+	if (ciErrNum==CL_SUCCESS && num_devices==0)
+	{
+		ciErrNum = CL_DEVICE_NOT_FOUND;
+	}
+	if (ciErrNum!=CL_SUCCESS)
+	{
+		// without a valid device list the loops below would read uninitialized entries
+		if(pErrNum != NULL) 
+		{
+			*pErrNum = ciErrNum;
+		}
+		return retContext;
+	}
+
+	// num_devices reports every matching device, which can be more than the array holds
+	if (num_devices>num_entries)
+	{
+		num_devices = num_entries;
+	}
+
+	cl_context_properties* cprops = (NULL == platform) ? NULL : cps;
+
+	if (pGLContext)
+	{
+		//search for the GPU that relates to the OpenCL context
+		for (cl_uint i=0;i<num_devices;i++)
+		{
+			retContext = clCreateContext(cprops,1,&devices[i],NULL,NULL,&ciErrNum);
+			if (ciErrNum==CL_SUCCESS)
+				break;
+		}
+	}
+	else
+	{
+		if (preferredDeviceIndex>=0 && (cl_uint)preferredDeviceIndex<num_devices)
+		{
+			//create a context of the preferred device index
+			retContext = clCreateContext(cprops,1,&devices[preferredDeviceIndex],NULL,NULL,&ciErrNum);
+		} else
+		{
+			//create a context of all devices
+			retContext = clCreateContext(cprops,num_devices,devices,NULL,NULL,&ciErrNum);
+		}
+	}
+	if(pErrNum != NULL) 
+	{
+		*pErrNum = ciErrNum;
+	}
+
+	return retContext;
+}
+
+// Creates an OpenCL context for the given device type on the first platform that accepts it.
+// The platform tried first is
+//   1. the one at preferredPlatformIndex, when that index is valid, otherwise
+//   2. the first platform whose vendor string equals the compile-time SDK vendor.
+// The remaining platforms are tried afterwards in enumeration order.
+//   pErrNum              - optional; receives the OpenCL error code of a failing query
+//   pGLContext, pGLDC    - forwarded to createContextFromPlatform
+//   preferredDeviceIndex - forwarded to createContextFromPlatform
+// Returns the context, or 0 when no platform produced one.
+cl_context btOpenCLUtils::createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLContext, void* pGLDC , int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	cl_uint numPlatforms = 0;
+	cl_context retContext = 0;
+	
+	cl_int ciErrNum = clGetPlatformIDs(0, NULL, &numPlatforms);
+	if(ciErrNum != CL_SUCCESS)
+	{
+		if(pErrNum != NULL) *pErrNum = ciErrNum;
+		return NULL;
+	}
+	if(numPlatforms > 0)     
+	{        
+		cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+		ciErrNum = clGetPlatformIDs(numPlatforms, platforms, NULL);
+		if(ciErrNum != CL_SUCCESS)
+		{
+			if(pErrNum != NULL) *pErrNum = ciErrNum;
+			delete[] platforms;
+			return NULL;
+		}
+
+		// index of the platform that is moved to the front of the list
+		cl_uint firstChoice = 0;
+
+		if (preferredPlatformIndex>=0 && (cl_uint)preferredPlatformIndex<numPlatforms)
+		{
+			// an explicit index wins over the vendor match; previously an earlier
+			// vendor match ended the search before the preferred index was reached
+			firstChoice = (cl_uint)preferredPlatformIndex;
+		} else
+		{
+			for (cl_uint i = 0; i < numPlatforms; ++i)         
+			{            
+				char pbuf[128];            
+				ciErrNum = clGetPlatformInfo(	platforms[i],
+					CL_PLATFORM_VENDOR,                                       
+					sizeof(pbuf),                                       
+					pbuf,                                       
+					NULL);
+				if(ciErrNum != CL_SUCCESS)
+				{
+					if(pErrNum != NULL) *pErrNum = ciErrNum;
+					delete[] platforms;
+					return NULL;
+				}
+
+				if(!strcmp(pbuf, spPlatformVendor))
+				{
+					firstChoice = i;
+					break;
+				}
+			}
+		}
+
+		if (firstChoice != 0)
+		{
+			cl_platform_id tmpPlatform = platforms[0];
+			platforms[0] = platforms[firstChoice];
+			platforms[firstChoice] = tmpPlatform;
+		}
+
+		for (cl_uint i = 0; i < numPlatforms; ++i)         
+		{
+			cl_platform_id platform = platforms[i];
+			assert(platform);
+
+			retContext = btOpenCLUtils::createContextFromPlatform(platform,deviceType,pErrNum,pGLContext,pGLDC,preferredDeviceIndex);
+
+			if (retContext)
+			{
+//				printf("OpenCL platform details:\n");
+				btOpenCLPlatformInfo platformInfo;
+
+				btOpenCLUtils::getPlatformInfo(platform, platformInfo);
+
+				printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+				printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+				printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+
+				break;
+			}
+		}
+
+		delete[] platforms;    
+	}
+	return retContext;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range or when the device list cannot be read
+//! @param cxMainContext         OpenCL context
+//! @param deviceIndex           zero based index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+cl_device_id btOpenCLUtils::getDevice(cl_context cxMainContext, int deviceIndex)
+{
+	size_t szParmDataBytes = 0;
+	cl_device_id* cdDevices;
+
+	// get the size of the list of devices associated with context
+	cl_int status = clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
+
+	const size_t numDevices = szParmDataBytes / sizeof(cl_device_id);
+
+	// valid indices are 0 .. numDevices-1 (the previous check let deviceIndex == numDevices through)
+	if( status != CL_SUCCESS || deviceIndex < 0 || (size_t)deviceIndex >= numDevices ) {
+		return (cl_device_id)-1;
+	}
+
+	cdDevices = (cl_device_id*) malloc(szParmDataBytes);
+	if( cdDevices == NULL ) {
+		return (cl_device_id)-1;
+	}
+
+	status = clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
+
+	// only trust the list contents when the query succeeded
+	cl_device_id device = (status == CL_SUCCESS) ? cdDevices[deviceIndex] : (cl_device_id)-1;
+	free(cdDevices);
+
+	return device;
+}
+
+// Returns the number of devices associated with the context.
+// The byte count starts at 0, so the result is 0 if clGetContextInfo fails without writing it.
+int btOpenCLUtils::getNumDevices(cl_context cxMainContext)
+{
+	size_t szParamDataBytes = 0;
+	clGetContextInfo(cxMainContext, CL_CONTEXT_DEVICES, 0, NULL, &szParamDataBytes);
+	int device_count = (int) (szParamDataBytes/ sizeof(cl_device_id));
+	return device_count;
+}
+
+// Queries the device through getDeviceInfo and prints the result to stdout.
+// size_t members are cast to unsigned int before they are handed to the %u
+// conversions, because passing a 64 bit size_t for %u is undefined behaviour.
+void btOpenCLUtils::printDeviceInfo(cl_device_id device)
+{
+	btOpenCLDeviceInfo info;
+	getDeviceInfo(device,info);
+
+	printf("  CL_DEVICE_NAME: \t\t\t%s\n", info.m_deviceName);
+	printf("  CL_DEVICE_VENDOR: \t\t\t%s\n", info.m_deviceVendor);
+	printf("  CL_DRIVER_VERSION: \t\t\t%s\n", info.m_driverVersion);
+
+	if( info.m_deviceType & CL_DEVICE_TYPE_CPU )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_GPU )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
+	if( info.m_deviceType & CL_DEVICE_TYPE_ACCELERATOR )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
+	if( info.m_deviceType & CL_DEVICE_TYPE_DEFAULT )
+		printf("  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
+
+	printf("  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", info.m_computeUnits);
+	printf("  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", (unsigned int)info.m_workitemDims);
+	printf("  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", (unsigned int)info.m_workItemSize[0], (unsigned int)info.m_workItemSize[1], (unsigned int)info.m_workItemSize[2]);
+	printf("  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", (unsigned int)info.m_workgroupSize);
+	printf("  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", info.m_clockFrequency);
+	printf("  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", info.m_addressBits);
+	printf("  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_maxMemAllocSize/ (1024 * 1024)));
+	printf("  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(info.m_globalMemSize/ (1024 * 1024)));
+	printf("  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", info.m_errorCorrectionSupport== CL_TRUE ? "yes" : "no");
+	printf("  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", info.m_localMemType == 1 ? "local" : "global");
+	printf("  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(info.m_localMemSize / 1024));
+	printf("  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(info.m_constantBufferSize / 1024));
+	if( info.m_queueProperties  & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
+		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");    
+	if( info.m_queueProperties & CL_QUEUE_PROFILING_ENABLE )
+		printf("  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
+
+	printf("  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", info.m_imageSupport);
+
+	printf("  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", info.m_maxReadImageArgs);
+	printf("  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", info.m_maxWriteImageArgs);
+	printf("\n  CL_DEVICE_IMAGE <dim>"); 
+	printf("\t\t\t2D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image2dMaxWidth);
+	printf("\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image2dMaxHeight);
+	printf("\t\t\t\t\t3D_MAX_WIDTH\t %u\n", (unsigned int)info.m_image3dMaxWidth);
+	printf("\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", (unsigned int)info.m_image3dMaxHeight);
+	printf("\t\t\t\t\t3D_MAX_DEPTH\t %u\n", (unsigned int)info.m_image3dMaxDepth);
+	// m_deviceExtensions is an array, so comparing it against 0 was always true;
+	// test for an empty string instead
+	if (info.m_deviceExtensions[0] != '\0') 
+		printf("\n  CL_DEVICE_EXTENSIONS:%s\n",info.m_deviceExtensions);
+	else 
+		printf("  CL_DEVICE_EXTENSIONS: None\n");
+	printf("  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t"); 
+	printf("CHAR %u, SHORT %u, INT %u,LONG %u, FLOAT %u, DOUBLE %u\n\n\n", 
+		info.m_vecWidthChar, info.m_vecWidthShort, info.m_vecWidthInt, info.m_vecWidthLong,info.m_vecWidthFloat, info.m_vecWidthDouble); 
+
+
+}
+
+// Fills info with the properties of the device, one clGetDeviceInfo query per member.
+// The return codes of the individual queries are not checked; a member whose query
+// fails keeps whatever value it had before the call.
+void btOpenCLUtils::getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo& info)
+{
+
+	// CL_DEVICE_NAME
+	clGetDeviceInfo(device, CL_DEVICE_NAME, BT_MAX_STRING_LENGTH, &info.m_deviceName, NULL);
+
+	// CL_DEVICE_VENDOR
+	clGetDeviceInfo(device, CL_DEVICE_VENDOR, BT_MAX_STRING_LENGTH, &info.m_deviceVendor, NULL);
+
+	// CL_DRIVER_VERSION
+	clGetDeviceInfo(device, CL_DRIVER_VERSION, BT_MAX_STRING_LENGTH, &info.m_driverVersion, NULL);
+
+	// CL_DEVICE_INFO
+	clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(cl_device_type), &info.m_deviceType, NULL);
+
+	// CL_DEVICE_MAX_COMPUTE_UNITS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(info.m_computeUnits), &info.m_computeUnits, NULL);
+
+	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
+	// OpenCL reports this value as a cl_uint while m_workitemDims is a size_t. Reading
+	// straight into the member only fills part of it where size_t is wider, so go
+	// through a temporary of the right type.
+	cl_uint workItemDims = 0;
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workItemDims), &workItemDims, NULL);
+	info.m_workitemDims = workItemDims;
+
+	// CL_DEVICE_MAX_WORK_ITEM_SIZES
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(info.m_workItemSize), &info.m_workItemSize, NULL);
+
+	// CL_DEVICE_MAX_WORK_GROUP_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(info.m_workgroupSize), &info.m_workgroupSize, NULL);
+
+	// CL_DEVICE_MAX_CLOCK_FREQUENCY
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(info.m_clockFrequency), &info.m_clockFrequency, NULL);
+
+	// CL_DEVICE_ADDRESS_BITS
+	clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(info.m_addressBits), &info.m_addressBits, NULL);
+
+	// CL_DEVICE_MAX_MEM_ALLOC_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(info.m_maxMemAllocSize), &info.m_maxMemAllocSize, NULL);
+
+	// CL_DEVICE_GLOBAL_MEM_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(info.m_globalMemSize), &info.m_globalMemSize, NULL);
+
+	// CL_DEVICE_ERROR_CORRECTION_SUPPORT
+	clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(info.m_errorCorrectionSupport), &info.m_errorCorrectionSupport, NULL);
+
+	// CL_DEVICE_LOCAL_MEM_TYPE
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(info.m_localMemType), &info.m_localMemType, NULL);
+
+	// CL_DEVICE_LOCAL_MEM_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(info.m_localMemSize), &info.m_localMemSize, NULL);
+
+	// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
+	clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(info.m_constantBufferSize), &info.m_constantBufferSize, NULL);
+
+	// CL_DEVICE_QUEUE_PROPERTIES
+	clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(info.m_queueProperties), &info.m_queueProperties, NULL);
+
+	// CL_DEVICE_IMAGE_SUPPORT
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(info.m_imageSupport), &info.m_imageSupport, NULL);
+
+	// CL_DEVICE_MAX_READ_IMAGE_ARGS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(info.m_maxReadImageArgs), &info.m_maxReadImageArgs, NULL);
+
+	// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
+	clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(info.m_maxWriteImageArgs), &info.m_maxWriteImageArgs, NULL);
+
+	// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &info.m_image2dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &info.m_image2dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &info.m_image3dMaxWidth, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &info.m_image3dMaxHeight, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &info.m_image3dMaxDepth, NULL);
+
+	// CL_DEVICE_EXTENSIONS: the space separated extension list is stored as one string
+	clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, BT_MAX_STRING_LENGTH, &info.m_deviceExtensions, NULL);
+
+	// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &info.m_vecWidthChar, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &info.m_vecWidthShort, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &info.m_vecWidthInt, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &info.m_vecWidthLong, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &info.m_vecWidthFloat, NULL);
+	clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &info.m_vecWidthDouble, NULL);
+}
+
+// Returns a pointer into name just past the last occurrence of pattern,
+// or name itself when pattern does not occur (or is empty).
+// Used to cut the directory part off a path: strip2("a/b/c.cl", "/") yields "c.cl".
+static const char* strip2(const char* name, const char* pattern)
+{
+	const size_t patlen = strlen(pattern);
+	const char* oriptr = name;
+	const char* patloc;
+
+	// an empty pattern matches at every position without advancing, which would never terminate
+	if (patlen == 0)
+	{
+		return oriptr;
+	}
+
+	// skip past every occurrence of the pattern; oriptr ends up behind the last one
+	while ((patloc = strstr(oriptr, pattern)) != NULL)
+	{
+		oriptr = patloc + patlen;
+	}
+	return oriptr;
+}
+
+// Builds an OpenCL program for a single device.
+//   kernelSource         - OpenCL C source, used unless an up-to-date cached binary is found
+//   pErrNum              - optional; receives the error code when building from source fails
+//   additionalMacros     - optional extra build options (may be NULL)
+//   clFileNameForCaching - optional path of the .cl file the source came from. When given,
+//                          the built binary is written to "cache/<file>.<device>.<driver>.bin".
+//                          On Windows that binary is loaded instead of compiling, as long as
+//                          it is not older than the .cl file.
+// Returns the built program, or 0 when building from source fails.
+cl_program btOpenCLUtils::compileCLProgramFromString(cl_context clContext, cl_device_id device, const char* kernelSource, cl_int* pErrNum, const char* additionalMacros , const char* clFileNameForCaching)
+{
+
+	cl_program m_cpProgram=0;
+	cl_int status;
+
+	char binaryFileName[522];
+
+	if (clFileNameForCaching)
+	{
+		
+		// zero-initialized so the file name stays well-formed if a query fails
+		char deviceName[256] = {0};
+		char driverVersion[256] = {0};
+		clGetDeviceInfo(device, CL_DEVICE_NAME, 256, &deviceName, NULL);
+		clGetDeviceInfo(device, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
+
+		
+		// drop the directory part, whichever separator is used
+		const char* strippedName = strip2(clFileNameForCaching,"\\");
+		strippedName = strip2(strippedName,"/");
+
+		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedName, deviceName,driverVersion );
+		//printf("searching for %s\n", binaryFileName);
+
+		bool fileUpToDate = false;
+		bool binaryFileValid=false;
+
+		FILETIME modtimeBinary; 
+
+#ifdef _WIN32
+		CreateDirectory("cache",0);
+		{
+			
+			HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+			if (binaryFileHandle ==INVALID_HANDLE_VALUE)
+			{
+				DWORD errorCode;
+				errorCode = GetLastError();
+				switch (errorCode)
+				{
+				case ERROR_FILE_NOT_FOUND:
+					{
+						printf("\nCached file not found %s\n", binaryFileName);
+						break;
+					}
+				case ERROR_PATH_NOT_FOUND:
+					{
+						printf("\nCached file path not found %s\n", binaryFileName);
+						break;
+					}
+				default:
+					{
+						printf("\nFailed reading cached file with errorCode = %d\n", errorCode);
+					}
+				}
+			} else
+			{
+				if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
+				{
+					DWORD errorCode;
+					errorCode = GetLastError();
+					printf("\nGetFileTime errorCode = %d\n", errorCode);
+				} else
+				{
+					binaryFileValid = true;
+				}
+				CloseHandle(binaryFileHandle);
+			}
+
+			if (binaryFileValid)
+			{
+				HANDLE srcFileHandle = CreateFile(clFileNameForCaching,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+				if (srcFileHandle!=INVALID_HANDLE_VALUE)
+				{
+					FILETIME modtimeSrc; 
+					if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
+					{
+						// without a source timestamp the cache cannot be validated:
+						// leave fileUpToDate false so the program is rebuilt from source
+						DWORD errorCode;
+						errorCode = GetLastError();
+						printf("\nGetFileTime errorCode = %d\n", errorCode);
+					}
+					else if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
+						||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
+					{
+						fileUpToDate=true;
+					} else
+					{
+						printf("\nCached binary file out-of-date (%s)\n",binaryFileName);
+					}
+					CloseHandle(srcFileHandle);
+				} 
+				else
+				{
+#ifdef _DEBUG
+					DWORD errorCode;
+					errorCode = GetLastError();
+					switch (errorCode)
+					{
+					case ERROR_FILE_NOT_FOUND:
+						{
+							printf("\nSrc file not found %s\n", clFileNameForCaching);
+							break;
+						}
+					case ERROR_PATH_NOT_FOUND:
+						{
+							printf("\nSrc path not found %s\n", clFileNameForCaching);
+							break;
+						}
+					default:
+						{
+							printf("\nnSrc file reading errorCode = %d\n", errorCode);
+						}
+					}
+
+					//we should make sure the src file exists so we can verify the timestamp with binary
+					assert(0);
+#else
+					//if we cannot find the source, assume it is OK in release builds
+					fileUpToDate = true;
+#endif
+				}
+			}
+			
+
+		}
+
+		if( fileUpToDate)
+		{
+			FILE* file = fopen(binaryFileName, "rb");
+			if (file)
+			{
+				fseek( file, 0L, SEEK_END );
+				// ftell reports failure as -1, which must not be turned into a huge size
+				long fileSize = ftell( file );
+				rewind( file );
+				size_t binarySize = (fileSize > 0) ? (size_t)fileSize : 0;
+				char* binary = new char[binarySize];
+				size_t bytesRead = fread( binary, sizeof(char), binarySize, file );
+				fclose( file );
+
+				// an empty or partially read cache file is ignored; the source build below takes over
+				if (binarySize > 0 && bytesRead == binarySize)
+				{
+					m_cpProgram = clCreateProgramWithBinary( clContext, 1,&device, &binarySize, (const unsigned char**)&binary, 0, &status );
+					btAssert( status == CL_SUCCESS );
+					if( status == CL_SUCCESS )
+					{
+						status = clBuildProgram( m_cpProgram, 1, &device, additionalMacros, 0, 0 );
+						btAssert( status == CL_SUCCESS );
+					}
+
+					if( status != CL_SUCCESS )
+					{
+						if (m_cpProgram)
+						{
+							char *build_log;
+							size_t ret_val_size = 0;
+							clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+							build_log = new char[ret_val_size+1];
+							clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+							build_log[ret_val_size] = '\0';
+							printf("%s\n", build_log);
+							// allocated with new[], so it has to be released with delete[]
+							delete[] build_log;
+							clReleaseProgram(m_cpProgram);
+						}
+						btAssert(0);
+						m_cpProgram = 0;
+					}
+				}
+				delete[] binary;
+			}
+		}
+#endif //_WIN32
+		
+	}
+	
+	if (!m_cpProgram)
+	{
+		cl_int localErrNum;
+		size_t program_length = strlen(kernelSource);
+
+		m_cpProgram = clCreateProgramWithSource(clContext, 1, (const char**)&kernelSource, &program_length, &localErrNum);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+
+		// Build the program with 'mad' Optimization option
+
+
+	#ifdef MAC
+		const char* flags = "-cl-mad-enable -DMAC -DGUID_ARG";
+	#else
+		//const char* flags = "-DGUID_ARG= -fno-alias";
+		const char* flags = "-DGUID_ARG= ";
+	#endif
+
+		// callers may pass NULL when they have no extra macros
+		const char* macros = additionalMacros ? additionalMacros : "";
+
+		char* compileFlags = new char[strlen(macros) + strlen(flags) + 5];
+		sprintf(compileFlags, "%s %s", flags, macros);
+		localErrNum = clBuildProgram(m_cpProgram, 1, &device, compileFlags, NULL, NULL);
+		if (localErrNum!= CL_SUCCESS)
+		{
+			char *build_log;
+			size_t ret_val_size = 0;
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+			build_log = new char[ret_val_size+1];
+			clGetProgramBuildInfo(m_cpProgram, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+
+			// to be carefully, terminate with \0
+			// there's no information in the reference whether the string is 0 terminated or not
+			build_log[ret_val_size] = '\0';
+
+
+			printf("Error in clBuildProgram, Line %u in file %s, Log: \n%s\n !!!\n\n", __LINE__, __FILE__, build_log);
+			delete[] build_log;
+			// do not leak the option string or the half-built program on the error path
+			delete [] compileFlags;
+			clReleaseProgram(m_cpProgram);
+			if (pErrNum)
+				*pErrNum = localErrNum;
+			return 0;
+		}
+
+		if( clFileNameForCaching )
+		{	//	write to binary
+
+			cl_uint numAssociatedDevices = 0;
+			status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
+			btAssert( status == CL_SUCCESS );
+			if (numAssociatedDevices==1)
+			{
+
+				size_t binarySize = 0;
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
+				btAssert( status == CL_SUCCESS );
+
+				char* binary = new char[binarySize];
+
+				status = clGetProgramInfo( m_cpProgram, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
+				btAssert( status == CL_SUCCESS );
+
+				// only cache a binary that was actually retrieved
+				if (status == CL_SUCCESS)
+				{
+					FILE* file = fopen(binaryFileName, "wb");
+					if (file)
+					{
+						fwrite( binary, sizeof(char), binarySize, file );
+						fclose( file );
+					} else
+					{
+						printf("cannot write file %s\n", binaryFileName);
+					}
+				}
+
+				delete [] binary;
+			}
+		}
+		delete [] compileFlags;
+	}
+
+	return m_cpProgram;
+}
+
+
+// Creates the kernel called kernelName.
+//   prog             - already built program to take the kernel from; when 0 the program
+//                      is built from kernelSource (with additionalMacros) and released
+//                      again before returning
+//   kernelSource     - only read when prog is 0
+//   pErrNum          - optional; CL_SUCCESS on success, otherwise the clCreateKernel error
+// Returns the kernel, or 0 on failure.
+cl_kernel btOpenCLUtils::compileCLKernelFromString(cl_context clContext, cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum, cl_program prog, const char* additionalMacros )
+{
+	printf("compiling kernel %s ",kernelName);
+	cl_kernel kernel;
+	cl_int localErrNum;
+
+	// kernelSource is deliberately not touched here: it may be NULL when prog is supplied
+	cl_program m_cpProgram = prog;
+	if (!m_cpProgram)
+	{
+		m_cpProgram = compileCLProgramFromString(clContext,device,kernelSource,pErrNum, additionalMacros);
+	}
+
+
+	// Create the kernel
+	kernel = clCreateKernel(m_cpProgram, kernelName, &localErrNum);
+
+	// A program built by this call is released whether or not the kernel could be
+	// created; previously it leaked on the error path.
+	if (!prog && m_cpProgram)
+	{
+		clReleaseProgram(m_cpProgram);
+	}
+
+	if (localErrNum != CL_SUCCESS)
+	{
+		printf("Error in clCreateKernel, Line %u in file %s, cannot find kernel function %s !!!\n\n", __LINE__, __FILE__, kernelName);
+		if (pErrNum)
+			*pErrNum = localErrNum;
+		return 0;
+	}
+
+	printf("ready. \n");
+
+
+	if (pErrNum)
+			*pErrNum = CL_SUCCESS;
+	return kernel;
+
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/btOpenCLUtils.h
@@ -0,0 +1,104 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2011 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//original author: Roman Ponomarev
+//cleanup by Erwin Coumans
+
+#ifndef BT_OPENCL_UTILS_H
+#define BT_OPENCL_UTILS_H
+
+#include "btOpenCLInclude.h"
+
+
+#define BT_MAX_STRING_LENGTH 1024
+
+/// Plain record describing one OpenCL device. It is the output argument of
+/// btOpenCLUtils::getDeviceInfo(). The string fields are fixed-size buffers of
+/// BT_MAX_STRING_LENGTH chars.
+/// NOTE(review): the field names suggest a one-to-one mapping onto clGetDeviceInfo
+/// queries (device name, compute units, work-group size, ...) - confirm against the
+/// implementation of getDeviceInfo, which is not part of this header.
+struct btOpenCLDeviceInfo
+{
+	char m_deviceName[BT_MAX_STRING_LENGTH];
+	char m_deviceVendor[BT_MAX_STRING_LENGTH];
+	char m_driverVersion[BT_MAX_STRING_LENGTH];
+	char m_deviceExtensions[BT_MAX_STRING_LENGTH];
+
+	cl_device_type		m_deviceType;
+	cl_uint 				m_computeUnits;
+	size_t 					m_workitemDims;
+	size_t 					m_workItemSize[3];
+	size_t 					m_image2dMaxWidth;
+	size_t 					m_image2dMaxHeight;
+	size_t 					m_image3dMaxWidth;
+	size_t 					m_image3dMaxHeight;
+	size_t 					m_image3dMaxDepth;
+	size_t 					m_workgroupSize;
+	cl_uint 				m_clockFrequency;
+	cl_ulong				m_constantBufferSize;
+	cl_ulong				m_localMemSize;
+	cl_ulong				m_globalMemSize;
+    cl_bool					m_errorCorrectionSupport;
+	cl_device_local_mem_type m_localMemType;
+	cl_uint					m_maxReadImageArgs;
+	cl_uint					m_maxWriteImageArgs;
+
+
+
+	cl_uint 				m_addressBits;
+	cl_ulong				m_maxMemAllocSize;
+	cl_command_queue_properties m_queueProperties;
+	cl_bool					m_imageSupport;
+	// presumably the preferred vector widths per element type - TODO confirm
+	cl_uint					m_vecWidthChar;
+	cl_uint					m_vecWidthShort;
+	cl_uint					m_vecWidthInt;
+	cl_uint					m_vecWidthLong;
+	cl_uint					m_vecWidthFloat;
+	cl_uint					m_vecWidthDouble;
+
+};
+
+/// Vendor, name and version strings of one OpenCL platform, each stored in a
+/// fixed-size buffer of BT_MAX_STRING_LENGTH chars. It is the output argument of
+/// btOpenCLUtils::getPlatformInfo().
+struct btOpenCLPlatformInfo
+{
+	char m_platformVendor[BT_MAX_STRING_LENGTH];
+	char m_platformName[BT_MAX_STRING_LENGTH];
+	char m_platformVersion[BT_MAX_STRING_LENGTH];
+};
+
+/// Collection of static helpers for OpenCL start-up: context creation, device and
+/// platform enumeration, and compiling programs/kernels from source strings.
+/// The class has no data members; every function is static.
+class btOpenCLUtils
+{
+public:
+
+	/// CL Context optionally takes a GL context. This is a generic type because we don't really want this code
+	/// to have to understand GL types. It is a HGLRC in _WIN32 or a GLXContext otherwise.
+	/// The two preferred indices default to -1.
+	/// NOTE(review): -1 presumably means "no preference" - confirm in the implementation.
+	static cl_context 	createContextFromType(cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0, int preferredDeviceIndex = -1, int preferredPlatformIndex= - 1);
+	
+	/// Device enumeration for an existing context; nr is the index passed to getDevice.
+	static int getNumDevices(cl_context cxMainContext);
+	static cl_device_id getDevice(cl_context cxMainContext, int nr);
+	/// Takes the info record by non-const reference as its output argument.
+	static void getDeviceInfo(cl_device_id device, btOpenCLDeviceInfo& info);
+	static void printDeviceInfo(cl_device_id device);
+
+	/// Returns a kernel named kernelName. When prog is 0 the kernel source is compiled first;
+	/// otherwise the kernel is looked up in prog. pErrNum is optional.
+	static cl_kernel compileCLKernelFromString( cl_context clContext,cl_device_id device, const char* kernelSource, const char* kernelName, cl_int* pErrNum=0, cl_program prog=0,const char* additionalMacros = "" );
+
+	//optional
+	/// NOTE(review): srcFileNameForCaching looks like it enables an on-disk cache of the
+	/// compiled binary - confirm in the implementation.
+	static cl_program compileCLProgramFromString( cl_context clContext,cl_device_id device, const char* kernelSource, cl_int* pErrNum=0,const char* additionalMacros = "" , const char* srcFileNameForCaching=0);
+
+	//the following optional APIs provide access using specific platform information
+	static int getNumPlatforms(cl_int* pErrNum=0);
+	///get the nr'th platform, where nr is in the range [0..getNumPlatforms)
+	static cl_platform_id getPlatform(int nr, cl_int* pErrNum=0);
+	/// Takes the info record by non-const reference as its output argument.
+	static void getPlatformInfo(cl_platform_id platform, btOpenCLPlatformInfo& platformInfo);
+	static const char* getSdkVendorName();
+	/// Same optional GL and preference arguments as createContextFromType, for one given platform.
+	static cl_context 	createContextFromPlatform(cl_platform_id platform, cl_device_type deviceType, cl_int* pErrNum, void* pGLCtx = 0, void* pGLDC = 0,int preferredDeviceIndex = -1, int preferredPlatformIndex= -1);
+};
+
+
+
+#endif // BT_OPENCL_UTILS_H
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/main.cpp
@@ -0,0 +1,92 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "btOpenCLUtils.h"
+#include <stdio.h>
+
+// Context and command queue used by the createContextFromType part of main() below.
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+
+
+
+/// Demo entry point. First lists every OpenCL platform together with its devices,
+/// then shows the shorter route: create a GPU context with createContextFromType and
+/// open (and immediately release) a command queue on each of its devices.
+/// Always returns 0.
+int main(int argc, char* argv[])
+{
+	(void)argc;
+	(void)argv;
+
+	// the btOpenCLUtils functions report errors through a cl_int*
+	cl_int ciErrNum = CL_SUCCESS;
+	
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
+
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+	int numPlatforms = btOpenCLUtils::getNumPlatforms();
+	printf("Num Platforms = %d\n", numPlatforms);
+
+	for (int i=0;i<numPlatforms;i++)
+	{
+		cl_platform_id platform = btOpenCLUtils::getPlatform(i);
+		btOpenCLPlatformInfo platformInfo;
+		btOpenCLUtils::getPlatformInfo(platform,platformInfo);
+		printf("--------------------------------\n");
+		printf("Platform info for platform nr %d:\n",i);
+		printf("  CL_PLATFORM_VENDOR: \t\t\t%s\n",platformInfo.m_platformVendor);
+		printf("  CL_PLATFORM_NAME: \t\t\t%s\n",platformInfo.m_platformName);
+		printf("  CL_PLATFORM_VERSION: \t\t\t%s\n",platformInfo.m_platformVersion);
+		
+		cl_context context = btOpenCLUtils::createContextFromPlatform(platform,deviceType,&ciErrNum);
+		if (!context)
+		{
+			// Without a context there is nothing to enumerate or release for this
+			// platform; report it and carry on with the next one.
+			printf("Cannot create an OpenCL context for platform nr %d (error %d)\n", i, ciErrNum);
+			continue;
+		}
+		
+		int numDevices = btOpenCLUtils::getNumDevices(context);
+		printf("Num Devices = %d\n", numDevices);
+		for (int j=0;j<numDevices;j++)
+		{
+			cl_device_id dev = btOpenCLUtils::getDevice(context,j);
+			btOpenCLDeviceInfo devInfo;
+			btOpenCLUtils::getDeviceInfo(dev,devInfo);
+			btOpenCLUtils::printDeviceInfo(dev);
+		}
+
+		clReleaseContext(context);
+	}
+
+	///Easier method to initialize OpenCL using createContextFromType for a GPU
+	deviceType = CL_DEVICE_TYPE_GPU;
+	
+	void* glCtx=0;
+	void* glDC = 0;
+	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
+	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
+
+	for (int i=0;i<numDev;i++)
+	{
+		cl_device_id		device;
+		device = btOpenCLUtils::getDevice(g_cxMainContext,i);
+		btOpenCLDeviceInfo clInfo;
+		btOpenCLUtils::getDeviceInfo(device,clInfo);
+		btOpenCLUtils::printDeviceInfo(device);
+		// create a command-queue
+		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		//normally you would create and execute kernels using this command queue
+
+		clReleaseCommandQueue(g_cqCommandQue);
+	}
+
+	clReleaseContext(g_cxMainContext);
+		
+	return 0;
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/basic_initialize/premake4.lua
@@ -0,0 +1,4 @@
+
+	-- Pull in the build scripts of the three vendor-specific subdirectories.
+	include "AMD"
+	include "Intel"
+	include "NVIDIA"
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/AMD/premake4.lua
@@ -0,0 +1,49 @@
+	
+	-- Broadphase benchmark built against the AMD OpenCL SDK.
+	-- findOpenCL_AMD, initOpenCL_AMD, initOpenGL, initGlut and initGlew are helper
+	-- functions defined outside this script.
+	hasCL = findOpenCL_AMD()
+	
+	-- The project is only declared when findOpenCL_AMD() returned a true value.
+	if (hasCL) then
+
+		project "OpenCL_broadphase_benchmark_AMD"
+
+		initOpenCL_AMD()
+	
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+		
+		includedirs {
+			"../../../rendering/BulletMath",
+			"../../primitives",
+			"../../../../../src"
+		}
+		
+		-- Besides the benchmark itself, the 3d grid broadphase, a few Bullet sources and
+		-- the OpenCL/OpenGL helper files are listed here as source files of this project.
+		files {
+			"../main.cpp",
+			"../findPairsOpenCL.cpp",
+			"../findPairsOpenCL.h",
+			"../btGridBroadphaseCL.cpp",
+			"../btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/Intel/premake4.lua
@@ -0,0 +1,49 @@
+	
+	-- Broadphase benchmark built against the Intel OpenCL SDK.
+	-- findOpenCL_Intel, initOpenCL_Intel, initOpenGL, initGlut and initGlew are helper
+	-- functions defined outside this script.
+	hasCL = findOpenCL_Intel()
+	
+	-- The project is only declared when findOpenCL_Intel() returned a true value.
+	if (hasCL) then
+
+		project "OpenCL_broadphase_benchmark_Intel"
+
+		initOpenCL_Intel()
+	
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+			includedirs {
+		"../../../rendering/BulletMath",
+		"../../primitives",
+		"../../../../../src"
+		}
+		
+		-- Besides the benchmark itself, the 3d grid broadphase, a few Bullet sources and
+		-- the OpenCL/OpenGL helper files are listed here as source files of this project.
+		files {
+			"../main.cpp",
+			"../findPairsOpenCL.cpp",
+			"../findPairsOpenCL.h",
+			"../btGridBroadphaseCL.cpp",
+			"../btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/NVIDIA/premake4.lua
@@ -0,0 +1,49 @@
+	
+	-- Broadphase benchmark built against the NVIDIA OpenCL SDK.
+	-- findOpenCL_NVIDIA, initOpenCL_NVIDIA, initOpenGL, initGlut and initGlew are helper
+	-- functions defined outside this script.
+	hasCL = findOpenCL_NVIDIA()
+	
+	-- The project is only declared when findOpenCL_NVIDIA() returned a true value.
+	if (hasCL) then
+
+		project "OpenCL_broadphase_benchmark_NVIDIA"
+
+		initOpenCL_NVIDIA()
+	
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+			includedirs {
+		"../../../rendering/BulletMath",
+		"../../primitives",
+		"../../../../../src"
+		}
+		
+		-- Besides the benchmark itself, the 3d grid broadphase, a few Bullet sources and
+		-- the OpenCL/OpenGL helper files are listed here as source files of this project.
+		files {
+			"../main.cpp",
+			"../findPairsOpenCL.cpp",
+			"../findPairsOpenCL.h",
+			"../btGridBroadphaseCL.cpp",
+			"../btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/broadphaseKernel.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/broadphaseKernel.cl
@@ -0,0 +1,335 @@
+MSTRINGIFY(
+
+// 3x3 matrix stored as three float4 rows. The helpers in this file write 0 into
+// the w lane of every row they produce.
+typedef struct
+{
+	float4	m_row[3];
+} Matrix3x3;
+
+// Shorthand for unsigned int used by the Body struct in this file.
+typedef unsigned int u32;
+
+
+// Per-body state read and written by the kernels in this file.
+// NOTE(review): the layout presumably has to match a host-side struct - confirm against the caller.
+typedef struct
+{
+	float4 m_pos;
+	float4 m_quat;
+	float4 m_linVel;
+	float4 m_angVel;
+
+	u32 m_shapeIdx;
+	u32 m_shapeType;
+	// an inverse mass of exactly 0 makes the kernels skip the body (setupBodiesKernel zeroes its velocities)
+	float m_invMass;
+	float m_restituitionCoeff;
+	float m_frictionCoeff;
+} Body;
+
+// Inverse inertia data of one body. setupBodiesKernel reads m_initInvInertia and
+// overwrites m_invInertia with the rotated result.
+typedef struct
+{
+	Matrix3x3 m_invInertia;
+	Matrix3x3 m_initInvInertia;
+} Shape;
+
+
+// Turns a quaternion (x, y, z, w) into a 3x3 matrix holding the ABSOLUTE values of the
+// usual quaternion-to-rotation-matrix terms. The w lane of every row is 0.
+__inline
+Matrix3x3 qtGetRotationMatrix(float4 quat)
+{
+	// squares of the x, y and z components
+	float4 sq = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
+
+	Matrix3x3 absRot;
+
+	absRot.m_row[0] = (float4)(
+		fabs(1-2*sq.y-2*sq.z),
+		fabs(2*quat.x*quat.y-2*quat.w*quat.z),
+		fabs(2*quat.x*quat.z+2*quat.w*quat.y),
+		0.f);
+
+	absRot.m_row[1] = (float4)(
+		fabs(2*quat.x*quat.y+2*quat.w*quat.z),
+		fabs(1-2*sq.x-2*sq.z),
+		fabs(2*quat.y*quat.z-2*quat.w*quat.x),
+		0.f);
+
+	absRot.m_row[2] = (float4)(
+		fabs(2*quat.x*quat.z-2*quat.w*quat.y),
+		fabs(2*quat.y*quat.z+2*quat.w*quat.x),
+		fabs(1-2*sq.x-2*sq.y),
+		0.f);
+
+	return absRot;
+}
+
+
+// One corner of an axis aligned bounding box plus an index. The AABB kernels in this
+// file store two of these per node: element 2*i is the minimum corner, element 2*i+1
+// the maximum corner, and uw holds the node index in both.
+typedef struct 
+{
+	float			fx;
+	float			fy;
+	float			fz;
+	unsigned int	uw;
+} btAABBCL;
+
+// Returns the transpose of the upper-left 3x3 part of m; the w lane of each row is 0.
+__inline
+Matrix3x3 mtTranspose(Matrix3x3 m)
+{
+	Matrix3x3 t;
+
+	// row 0 of the result is column x of the input
+	t.m_row[0].x = m.m_row[0].x;
+	t.m_row[0].y = m.m_row[1].x;
+	t.m_row[0].z = m.m_row[2].x;
+	t.m_row[0].w = 0.f;
+
+	// row 1 of the result is column y of the input
+	t.m_row[1].x = m.m_row[0].y;
+	t.m_row[1].y = m.m_row[1].y;
+	t.m_row[1].z = m.m_row[2].y;
+	t.m_row[1].w = 0.f;
+
+	// row 2 of the result is column z of the input
+	t.m_row[2].x = m.m_row[0].z;
+	t.m_row[2].y = m.m_row[1].z;
+	t.m_row[2].z = m.m_row[2].z;
+	t.m_row[2].w = 0.f;
+
+	return t;
+}
+
+// Dot product of the xyz parts of a and b; the w lanes are forced to 0 first.
+__inline
+float dot3F4(float4 a, float4 b)
+{
+	// a and b are by-value copies so clearing w here does not affect the caller
+	a.w = 0.f;
+	b.w = 0.f;
+	return dot(a, b);
+}
+
+
+// Returns the 3x3 product a*b. b is transposed first so that every element of the
+// result is a 3-component dot product of a row of a with a row of transB.
+// The w lane of every result row is 0.
+__inline
+Matrix3x3 mtMul(Matrix3x3 a, Matrix3x3 b)
+{
+	Matrix3x3 transB;
+	transB = mtTranspose( b );
+	Matrix3x3 ans;
+	//	why this doesn't run when 0ing in the for{}
+	// NOTE(review): the w lanes are cleared before the loop instead of inside it; the
+	// remark above suggests the in-loop variant misbehaved on some compiler - keep as is.
+	a.m_row[0].w = 0.f;
+	a.m_row[1].w = 0.f;
+	a.m_row[2].w = 0.f;
+	for(int i=0; i<3; i++)
+	{
+//	a.m_row[i].w = 0.f;
+		ans.m_row[i].x = dot3F4(a.m_row[i],transB.m_row[0]);
+		ans.m_row[i].y = dot3F4(a.m_row[i],transB.m_row[1]);
+		ans.m_row[i].z = dot3F4(a.m_row[i],transB.m_row[2]);
+		ans.m_row[i].w = 0.f;
+	}
+	return ans;
+}
+
+
+// Signed quaternion-to-rotation-matrix conversion, used for the inertia update below.
+// qtGetRotationMatrix in this file returns element-wise absolute values, which suits
+// AABB extents but is not a rotation and must not be used to rotate a tensor.
+__inline
+Matrix3x3 qtGetSignedRotationMatrix(float4 quat)
+{
+	float4 quat2 = (float4)(quat.x*quat.x, quat.y*quat.y, quat.z*quat.z, 0.f);
+	Matrix3x3 out;
+
+	out.m_row[0] = (float4)(
+		1-2*quat2.y-2*quat2.z,
+		2*quat.x*quat.y-2*quat.w*quat.z,
+		2*quat.x*quat.z+2*quat.w*quat.y,
+		0.f);
+
+	out.m_row[1] = (float4)(
+		2*quat.x*quat.y+2*quat.w*quat.z,
+		1-2*quat2.x-2*quat2.z,
+		2*quat.y*quat.z-2*quat.w*quat.x,
+		0.f);
+
+	out.m_row[2] = (float4)(
+		2*quat.x*quat.z-2*quat.w*quat.y,
+		2*quat.y*quat.z+2*quat.w*quat.x,
+		1-2*quat2.x-2*quat2.y,
+		0.f);
+
+	return out;
+}
+
+
+//apply gravity
+//update world inverse inertia tensor
+//copy velocity from arrays to bodies
+//copy transforms from buffer to bodies
+//
+// One work-item per body. Bodies whose inverse mass is 0 only get their velocities
+// zeroed. Positions are read from g_vertexBuffer at float4 index startOffset/4 + nodeID
+// and orientations numNodes entries further on.
+// The time step (0.0166666) and gravity (0, -9.8, 0) are fixed constants.
+
+__kernel void 
+  setupBodiesKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
+		   __global float4 *linVel,
+		   __global float4 *pAngVel,
+		   __global Body* gBodies, __global Shape* bodyInertias
+		   )
+{
+	int nodeID = get_global_id(0);
+		
+	float timeStep = 0.0166666f;
+
+	if( nodeID < numNodes )
+	{
+		float inverseMass = gBodies[nodeID].m_invMass;
+		if (inverseMass != 0.f)
+		{
+			float4 position = g_vertexBuffer[nodeID + startOffset/4];
+			float4 orientation = g_vertexBuffer[nodeID + startOffset/4+numNodes];
+
+			float4 gravityAcceleration = (float4)(0.f,-9.8f,0.f,0.f);
+			linVel[nodeID] += gravityAcceleration * timeStep;
+		
+			gBodies[nodeID].m_pos = position;
+			gBodies[nodeID].m_quat = orientation;
+
+			gBodies[nodeID].m_linVel = (float4)(linVel[nodeID].xyz,0.f);
+			gBodies[nodeID].m_angVel = (float4)(pAngVel[nodeID].xyz,0.f);
+
+			// world inverse inertia = R * localInverseInertia * R^T with the signed rotation R
+			Matrix3x3 m = qtGetSignedRotationMatrix( orientation);
+			Matrix3x3 mT = mtTranspose( m );
+
+			Matrix3x3 tmp = mtMul(m, bodyInertias[nodeID].m_initInvInertia);
+			Matrix3x3 tmp2 = mtMul(tmp, mT);
+			bodyInertias[nodeID].m_invInertia = tmp2;
+
+		} else
+		{
+			gBodies[nodeID].m_linVel = (float4)(0.f,0.f,0.f,0.f);
+			gBodies[nodeID].m_angVel = (float4)(0.f,0.f,0.f,0.f);
+		}
+
+
+	}
+}
+
+
+// Copies the linear and angular velocity of every body with a non-zero inverse mass
+// from gBodies back into the linVel and pAngVel arrays. The w lane is written as 0.
+// startOffset, g_vertexBuffer and bodyInertias are not used by this kernel.
+__kernel void 
+  copyVelocitiesKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
+		   __global float4 *linVel,
+		   __global float4 *pAngVel,
+		   __global Body* gBodies, __global Shape* bodyInertias
+		   )
+{
+	int bodyIdx = get_global_id(0);
+
+	// surplus work-items past the end of the arrays have nothing to do
+	if( bodyIdx >= numNodes )
+		return;
+
+	// bodies with an inverse mass of exactly 0 are left untouched
+	if (gBodies[bodyIdx].m_invMass == 0.f)
+		return;
+
+	float4 bodyLin = gBodies[bodyIdx].m_linVel;
+	float4 bodyAng = gBodies[bodyIdx].m_angVel;
+	linVel[bodyIdx] = (float4)(bodyLin.xyz,0.f);
+	pAngVel[bodyIdx] = (float4)(bodyAng.xyz,0.f);
+}
+
+
+
+// Writes a world-space AABB for every node, using fixed half extents of 1.01 on each
+// axis, and resets the color slot of the node to green.
+// g_vertexBuffer is addressed in float4 units: positions start at startOffset/4,
+// orientations follow numNodes entries later and colors another numNodes entries later.
+__kernel void 
+  initializeGpuAabbsSimple( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer, __global btAABBCL* pAABB)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID >= numNodes )
+		return;
+
+	int posSlot = nodeID + startOffset/4;
+	int quatSlot = posSlot + numNodes;
+	int colorSlot = quatSlot + numNodes;
+
+	float4 position = g_vertexBuffer[posSlot];
+	float4 orientation = g_vertexBuffer[quatSlot];
+	// the previous color is loaded but never used
+	float4 color = g_vertexBuffer[colorSlot];
+
+	g_vertexBuffer[colorSlot] = (float4)(.4f,1.f,.4f,1.f);
+
+	float4 halfExtents = (float4)(1.01f,1.01f,1.01f,0.f);
+
+	// absolute rotation matrix times half extents gives the world-space half size
+	Matrix3x3 absRot = qtGetRotationMatrix(orientation);
+	float4 extent;
+	extent.x = dot(absRot.m_row[0],halfExtents);
+	extent.y = dot(absRot.m_row[1],halfExtents);
+	extent.z = dot(absRot.m_row[2],halfExtents);
+	extent.w = 0.f;
+
+	btAABBCL lo;
+	lo.fx = position.x-extent.x;
+	lo.fy = position.y-extent.y;
+	lo.fz = position.z-extent.z;
+	lo.uw = nodeID;
+	pAABB[nodeID*2] = lo;
+
+	btAABBCL hi;
+	hi.fx = position.x+extent.x;
+	hi.fy = position.y+extent.y;
+	hi.fz = position.z+extent.z;
+	hi.uw = nodeID;
+	pAABB[nodeID*2+1] = hi;
+}
+
+
+
+// Writes a world-space AABB for every node from the local AABB of its shape, and resets
+// the color slot of the node to green. The local AABB of shape s is the pair
+// plocalShapeAABB[2*s] (min) and plocalShapeAABB[2*s+1] (max). Nodes whose shape index
+// is negative when read as int get the green color but no AABB.
+__kernel void 
+  initializeGpuAabbsFull( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer, __global Body* gBodies, __global btAABBCL* plocalShapeAABB, __global btAABBCL* pAABB)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID >= numNodes )
+		return;
+
+	int posSlot = nodeID + startOffset/4;
+	int quatSlot = posSlot + numNodes;
+	int colorSlot = quatSlot + numNodes;
+
+	float4 position = g_vertexBuffer[posSlot];
+	float4 orientation = g_vertexBuffer[quatSlot];
+	// the previous color is loaded but never used
+	float4 color = g_vertexBuffer[colorSlot];
+
+	g_vertexBuffer[colorSlot] = (float4)(.4f,1.f,.4f,1.f);
+
+	int shapeIndex = gBodies[nodeID].m_shapeIdx;
+	if (shapeIndex<0)
+		return;
+
+	btAABBCL minAabb = plocalShapeAABB[shapeIndex*2];
+	btAABBCL maxAabb = plocalShapeAABB[shapeIndex*2+1];
+
+	float4 halfExtents = ((float4)(maxAabb.fx - minAabb.fx,maxAabb.fy - minAabb.fy,maxAabb.fz - minAabb.fz,0.f))*0.5f;
+
+	// absolute rotation matrix times half extents gives the world-space half size
+	Matrix3x3 absRot = qtGetRotationMatrix(orientation);
+	float4 extent;
+	extent.x = dot(absRot.m_row[0],halfExtents);
+	extent.y = dot(absRot.m_row[1],halfExtents);
+	extent.z = dot(absRot.m_row[2],halfExtents);
+	extent.w = 0.f;
+
+	btAABBCL lo;
+	lo.fx = position.x-extent.x;
+	lo.fy = position.y-extent.y;
+	lo.fz = position.z-extent.z;
+	lo.uw = nodeID;
+	pAABB[nodeID*2] = lo;
+
+	btAABBCL hi;
+	hi.fx = position.x+extent.x;
+	hi.fy = position.y+extent.y;
+	hi.fz = position.z+extent.z;
+	hi.uw = nodeID;
+	pAABB[nodeID*2+1] = hi;
+}
+
+
+// One work-item per overlapping pair: paints the color slots of both nodes of the pair red.
+// Color slots live in g_vertexBuffer at float4 index startOffset/4 + 2*numNodes + node.
+__kernel void 
+  broadphaseColorKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer, __global int2* pOverlappingPairs, const int numOverlap)
+{
+	int pairIdx = get_global_id(0);
+	if( pairIdx >= numOverlap )
+		return;
+
+	int2 pair = pOverlappingPairs[pairIdx];
+	int firstColorSlot = pair.x + startOffset/4+numNodes+numNodes;
+	int secondColorSlot = pair.y + startOffset/4+numNodes+numNodes;
+
+	float4 red = (float4)(1.f,0.4f,0.4f,1.f);
+	g_vertexBuffer[firstColorSlot] = red;
+	g_vertexBuffer[secondColorSlot] = red;
+}
+
+
+
+// Brute-force proximity coloring: every node is compared against all other nodes.
+// A node is painted blue when another node has exactly the same position, red scaled by
+// 0.25 per neighbour with squared distance below 7 otherwise, and green when it has no
+// such neighbour.
+__kernel void 
+  broadphaseKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID >= numNodes )
+		return;
+
+	int posBase = startOffset/4;
+	int colorSlot = nodeID + startOffset/4+numNodes+numNodes;
+
+	float4 position = g_vertexBuffer[nodeID + posBase];
+	// the previous color is loaded but never used
+	float4 color = g_vertexBuffer[colorSlot];
+
+	float4 red = (float4)(1.f,0.f,0.f,0.f);
+	float4 green = (float4)(0.f,1.f,0.f,0.f);
+	float4 blue = (float4)(0.f,0.f,1.f,0.f);
+
+	// start out green; the final color is written again after the scan
+	g_vertexBuffer[colorSlot] = green;
+
+	float overlap = 0;
+	int equal = 0;
+	for (int other=0; other<numNodes; other++)
+	{
+		if (other==nodeID)
+			continue;
+
+		float4 otherPosition = g_vertexBuffer[other + posBase];
+
+		if ((otherPosition.x == position.x)&&
+			(otherPosition.y == position.y)&&
+			(otherPosition.z == position.z))
+		{
+			equal = 1;
+		}
+
+		float dx = otherPosition.x - position.x;
+		float dy = otherPosition.y - position.y;
+		float dz = otherPosition.z - position.z;
+		float distsqr = (dx*dx)+(dy*dy)+(dz*dz);
+
+		if (distsqr<7.f)
+			overlap+=0.25f;
+	}
+
+	float4 result = green;
+	if (equal)
+	{
+		result = blue;
+	} else if (overlap>0.f)
+	{
+		result = red*overlap;
+	}
+	g_vertexBuffer[colorSlot] = result;
+}
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.cpp
@@ -0,0 +1,231 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Roman Ponomarev, Erwin Coumans
+
+#ifdef RELEASE_ME
+#define COMPUTE_AABB_KERNEL_PATH "computeAabbKernelOCL.cl"
+#else
+#define COMPUTE_AABB_KERNEL_PATH "..\\..\\opencl\\broadphase_benchmark\\computeAabbKernelOCL"
+#endif
+
+
+// The header is named btGridBroadphaseCL.h; the include has to match that spelling
+// exactly, otherwise the build fails on case-sensitive file systems.
+#include "btGridBroadphaseCL.h"
+#include "LinearMath/btQuickprof.h"
+#include "Adl/Adl.h"
+#include "AdlPrimitives/Math/Math.h"
+
+#include "Adl/AdlKernel.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+// The .cl file wraps its whole content in MSTRINGIFY( ... ); so including it here
+// turns the kernel source into the string literal that initializes spComputeAabbSource.
+#define MSTRINGIFY(A) #A
+static const char* spComputeAabbSource= 
+#include "computeAabbKernelOCL.cl"
+
+/// Host-side AABB record: minimum corner plus an index, then maximum corner plus an index.
+/// It has the same fields in the same order as btAabbCL in computeAabbKernelOCL.cl.
+struct  btTmpAabb
+{
+	float			minfx;
+	float			minfy;
+	float			minfz;
+	unsigned int	index0;
+	float			maxfx;
+	float			maxfy;
+	float			maxfz;
+	unsigned int	index1;	
+} ;
+
+
+
+
+/// Forwards all arguments to bt3dGridBroadphaseOCL, then fetches the three kernels of
+/// computeAabbKernelOCL.cl, creates the constant buffer and allocates the device buffer
+/// m_dAllOverlappingPairs, which is cleared to zero.
+btGridBroadphaseCl::btGridBroadphaseCl(	btOverlappingPairCache* overlappingPairCache,
+							const btVector3& cellSize, 
+							int gridSizeX, int gridSizeY, int gridSizeZ, 
+							int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+							btScalar maxSmallProxySize,
+							int maxSmallProxiesPerCell,
+							cl_context context,
+							cl_device_id device,
+							cl_command_queue queue,
+							adl::DeviceCL* deviceCL)
+:bt3dGridBroadphaseOCL(overlappingPairCache,cellSize,
+				gridSizeX, gridSizeY, gridSizeZ, 
+						maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy,
+						maxSmallProxySize,maxSmallProxiesPerCell,
+						context,device,queue,deviceCL)			
+{
+	m_computeAabbKernel = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"computeAabb","",spComputeAabbSource);
+
+	m_countOverlappingPairs = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"countOverlappingpairs","",spComputeAabbSource);
+
+	m_squeezePairCaches = m_deviceCL->getKernel(COMPUTE_AABB_KERNEL_PATH,"squeezePairCaches","",spComputeAabbSource);
+
+	m_aabbConstBuffer = new adl::Buffer<MyAabbConstData >(m_deviceCL,1,adl::BufferBase::BUFFER_CONST);
+
+	// One MyUint2 per possible pair. The same byte count is used for the allocation,
+	// the host-side clear and the upload so the three can never disagree.
+	const size_t memSize = sizeof(MyUint2) * m_maxHandles * m_maxPairsPerBody;
+
+	cl_int ciErrNum=0;
+	m_dAllOverlappingPairs = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	// check the allocation before the buffer is used as an upload target
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memset(m_hAllOverlappingPairs, 0x00, memSize);
+	copyArrayToDevice(m_dAllOverlappingPairs, m_hAllOverlappingPairs, memSize);
+}
+
+/// Releases the two resources created in the constructor: the OpenCL buffer
+/// m_dAllOverlappingPairs and the heap-allocated constant buffer.
+btGridBroadphaseCl::~btGridBroadphaseCl()
+{
+	clReleaseMemObject(m_dAllOverlappingPairs);
+	
+	delete m_aabbConstBuffer;
+
+}
+
+
+
+/// Intentionally does nothing.
+/// The previous body returned unconditionally on its first line, so the host-side
+/// handle bookkeeping that followed was never executed; that unreachable code has
+/// been removed. Both parameters are ignored.
+void btGridBroadphaseCl::prepareAABB(float* positions, int numObjects)
+{
+	(void)positions;
+	(void)numObjects;
+}
+/// Forwards to the base class implementation without adding anything.
+void btGridBroadphaseCl::calcHashAABB()
+{
+	bt3dGridBroadphaseOCL::calcHashAABB();	
+}
+
+
+/// Runs one broadphase update: set parameters, hash and sort the AABBs, find the cell
+/// starts and the overlapping pairs, then count, scan and squeeze the pairs into
+/// m_dAllOverlappingPairs using the two kernels fetched in the constructor.
+///
+/// @param positions   only forwarded to prepareAABB()
+/// @param numObjects  number of objects; passed to both kernels and used to size their launches
+void btGridBroadphaseCl::calculateOverlappingPairs(float* positions, int numObjects)
+{
+	// Work-group size and launch size shared by both kernel launches below.
+	// The rounding is unchanged: it always yields at least one group, and one group more
+	// than needed when numObjects is a multiple of the group size. countOverlappingpairs
+	// returns early for indices >= numObjects.
+	// NOTE(review): squeezePairCaches presumably guards the same way - confirm.
+	size_t workGroupSize = 64;
+	size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
+
+	// update constants
+	{
+		BT_PROFILE("setParameters");
+		setParameters(&m_params);
+	}
+
+	// prepare AABB array
+	{
+		BT_PROFILE("prepareAABB");
+		prepareAABB(positions, numObjects);
+	}
+	// calculate hash
+	{
+		BT_PROFILE("calcHashAABB");
+		calcHashAABB();
+	}
+
+	{
+		BT_PROFILE("sortHash");
+		// sort bodies based on hash
+		sortHash();
+	}
+
+	// find start of each cell
+	{
+		BT_PROFILE("findCellStart");
+		findCellStart();
+	}
+	
+	{
+		BT_PROFILE("findOverlappingPairs");
+		// findOverlappingPairs (small/small)
+		findOverlappingPairs();
+	}
+
+	// count the pairs per object on the device (used instead of computePairCacheChanges())
+	{
+		BT_PROFILE("computePairCacheChanges");
+		cl_kernel countKernel = (cl_kernel)m_countOverlappingPairs->m_kernel;
+
+		// CL_SUCCESS is 0, so OR-ing the results leaves a non-zero value if any call failed
+		cl_int ciErrNum = CL_SUCCESS;
+		ciErrNum |= clSetKernelArg(countKernel, 0, sizeof(int), (void*)&numObjects);
+		ciErrNum |= clSetKernelArg(countKernel, 1, sizeof(cl_mem),(void*)&m_dPairBuff);
+		ciErrNum |= clSetKernelArg(countKernel, 2, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
+		ciErrNum |= clSetKernelArg(countKernel, 3, sizeof(cl_mem),(void*)&m_dPairScanChanged);
+		ciErrNum |= clSetKernelArg(countKernel, 4, sizeof(cl_mem),(void*)&m_dAABB);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, countKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0 );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		ciErrNum = clFlush(m_cqCommandQue);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+	{
+		BT_PROFILE("scanOverlappingPairBuff");
+		scanOverlappingPairBuff(false);
+	}
+	{
+		BT_PROFILE("squeezeOverlappingPairBuff");
+//#define FORCE_CPU
+#ifdef FORCE_CPU
+		bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff();
+		copyArrayToDevice(m_dPairsChangedXY, m_hPairsChangedXY, sizeof( MyUint2) * m_numPrefixSum); //gSum
+#else
+		cl_kernel squeezeKernel = (cl_kernel)m_squeezePairCaches->m_kernel;
+
+		cl_int ciErrNum = CL_SUCCESS;
+		ciErrNum |= clSetKernelArg(squeezeKernel, 0, sizeof(int), (void*)&numObjects);
+		ciErrNum |= clSetKernelArg(squeezeKernel, 1, sizeof(cl_mem),(void*)&m_dPairBuff);
+		ciErrNum |= clSetKernelArg(squeezeKernel, 2, sizeof(cl_mem),(void*)&m_dPairBuffStartCurr);
+		ciErrNum |= clSetKernelArg(squeezeKernel, 3, sizeof(cl_mem),(void*)&m_dPairScanChanged);
+		ciErrNum |= clSetKernelArg(squeezeKernel, 4, sizeof(cl_mem),(void*)&m_dAllOverlappingPairs);
+		ciErrNum |= clSetKernelArg(squeezeKernel, 5, sizeof(cl_mem),(void*)&m_dAABB);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+		ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, squeezeKernel, 1, NULL, &numWorkItems, &workGroupSize, 0,0,0 );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+#endif
+
+	}
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/btGridBroadphaseCL.h
@@ -0,0 +1,73 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Roman Ponomarev, Erwin Coumans
+
+#ifndef GRID_BROADPHASE_CL_H
+#define GRID_BROADPHASE_CL_H
+
+#include "../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h"
+
+#include "Adl/Adl.h"
+#include "Adl/AdlKernel.h"
+
+
+/// Element type of the constant buffer m_aabbConstBuffer. It has the same two int
+/// fields as MyAabbConstDataCL in computeAabbKernelOCL.cl, where numElem bounds the
+/// work-item index of the computeAabb kernel.
+struct MyAabbConstData 
+{
+	int bla;
+	int numElem;
+};
+
+
+
+/// Grid broadphase derived from bt3dGridBroadphaseOCL that adds three adl kernels and
+/// a device buffer receiving the squeezed list of overlapping pairs.
+class btGridBroadphaseCl : public bt3dGridBroadphaseOCL
+{
+protected:
+
+	// kernels fetched in the constructor from computeAabbKernelOCL.cl
+	adl::Kernel*			m_computeAabbKernel;
+	adl::Kernel*			m_countOverlappingPairs;
+	adl::Kernel*			m_squeezePairCaches;
+
+
+	// created in the constructor, deleted in the destructor
+	adl::Buffer<MyAabbConstData>*	m_aabbConstBuffer;
+
+
+	public:
+
+		// device buffer passed to the squeezePairCaches kernel as its pair output;
+		// created in the constructor, released in the destructor
+		cl_mem					m_dAllOverlappingPairs;
+
+		
+		// all arguments are forwarded to the bt3dGridBroadphaseOCL constructor
+		btGridBroadphaseCl(	btOverlappingPairCache* overlappingPairCache,
+							const btVector3& cellSize, 
+							int gridSizeX, int gridSizeY, int gridSizeZ, 
+							int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+							btScalar maxSmallProxySize,
+							int maxSmallProxiesPerCell = 4,
+							cl_context context = NULL,
+							cl_device_id device = NULL,
+							cl_command_queue queue = NULL,
+							adl::DeviceCL* deviceCL=0
+							);
+		
+		virtual void prepareAABB(float* positions, int numObjects);
+		virtual void calcHashAABB();
+
+		// runs the complete broadphase update for numObjects objects
+		void calculateOverlappingPairs(float* positions, int numObjects);
+		
+		virtual ~btGridBroadphaseCl();							
+	
+};
+
+#endif //GRID_BROADPHASE_CL_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/computeAabbKernelOCL.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/computeAabbKernelOCL.cl
@@ -0,0 +1,112 @@
+MSTRINGIFY(
+
+// Constant block passed by value to computeAabb.
+typedef struct 
+{
+	int bla;		// not read by any kernel in this file
+	int numElem;	// number of elements; computeAabb ignores work-items at or beyond it
+} MyAabbConstDataCL ;
+
+// Axis-aligned bounding box stored as eight 32-bit fields: min corner + index0 then max corner + index1.
+// computeAabb fills both index fields with the element index.
+typedef struct 
+{
+	float			minfx;
+	float			minfy;
+	float			minfz;
+	unsigned int	index0;
+	float			maxfx;
+	float			maxfy;
+	float			maxfz;
+	unsigned int	index1;	
+} btAabbCL;
+
+
+// Builds one box per input position: a cube reaching 1 unit from positions[i] along every axis.
+// Both index slots of the box receive the element index.
+__kernel void   computeAabb( __global btAabbCL* aabbs,__global float4* positions, MyAabbConstDataCL cb)
+{
+	int i = get_global_id(0);
+
+	// surplus work-items past the element count have nothing to do
+	if( i >= cb.numElem )
+	{
+		return;
+	}
+
+	float4 centre = positions[i];
+
+	btAabbCL box;
+	box.minfx = centre.x -1.f;
+	box.minfy = centre.y -1.f;
+	box.minfz = centre.z -1.f;
+	box.index0 = i;
+	box.maxfx = centre.x +1.f;
+	box.maxfy = centre.y +1.f;
+	box.maxfz = centre.z +1.f;
+	box.index1 = i;
+
+	aabbs[i] = box;
+}
+
+
+// For each object counts the pair-buffer entries that carry at least one of the two flag bits
+// 0x60000000 and stores that count in pPairScan[index+1].
+// The handle of the object is read from the w lane of the first float4 of its AABB.
+__kernel void countOverlappingpairs(	int numObjects,
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global int* pPairScan, 
+										__global float4* pAABB )
+{
+	int index = get_global_id(0);
+	if(index < numObjects)
+	{
+		// two float4 per AABB; the first one holds the handle in .w
+		int handleIndex = as_int(pAABB[index * 2].w);
+		int2 range = pPairBuffStartCurr[handleIndex];
+		__global int* entries = pPairBuff + range.x;
+		int flagged = 0;
+		for(int k = 0; k < range.y; k++)
+		{
+			//either new or existing pairs (ignore old non-overlapping pairs)
+			if(entries[k] & 0x60000000)
+			{
+				flagged++;
+			}
+		}
+		pPairScan[index+1] = flagged;
+	}
+} 
+
+
+// Per object: copies every flagged pair-buffer entry (flag bits 0x60000000 stripped) to pPairOut as
+// (handle, partner) starting at offset pPairScan[index+1], and at the same time compacts the
+// flagged entries to the front of the object's own pair-buffer range, updating its entry count.
+// NOTE(review): assumes pPairOut and pPairBuff are distinct buffers - confirm at the call site.
+__kernel void squeezePairCaches(	int numObjects,
+											__global int* pPairBuff, 
+											__global int2* pPairBuffStartCurr, 
+											__global int* pPairScan,
+											__global int2* pPairOut, 
+											__global float4* pAABB )
+{
+	int index = get_global_id(0);
+	if(index < numObjects)
+	{
+		int handleIndex = as_int(pAABB[index * 2].w);
+		int2 range = pPairBuffStartCurr[handleIndex];
+		__global int* cache = pPairBuff + range.x;
+		__global int2* dst = pPairOut + pPairScan[index+1];
+		int kept = 0;
+		for(int k = 0; k < range.y; k++)
+		{
+			int entry = cache[k];
+			if(entry & 0x60000000)
+			{
+				int partner = entry & (~0x60000000);
+				int2 newpair;
+				newpair.x = handleIndex;
+				newpair.y = partner;
+				dst[kept] = newpair;
+				// in-place compaction: kept never exceeds k so unread entries are never overwritten
+				cache[kept] = partner;
+				kept++;
+			}
+		}
+		int2 newStartCurr;
+		newStartCurr.x = range.x;
+		newStartCurr.y = kept;
+		pPairBuffStartCurr[handleIndex] = newStartCurr;
+	}
+}
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/findPairsOpenCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/findPairsOpenCL.cpp
@@ -0,0 +1,204 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Roman Ponomarev, Erwin Coumans
+
+#include "findPairsOpenCL.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+
+#define MSTRINGIFY(A) #A
+static char* broadphaseKernelString = 
+#include "broadphaseKernel.cl"
+
+#define GRID_BROADPHASE_PATH "..\\..\\opencl\\broadphase_benchmark\\broadphaseKernel.cl"
+
+
+
+
+// Stores the OpenCL context, device and queue in fpio, compiles the broadphase program once and
+// creates the six kernels used by the functions in this file.
+// maxHandles and maxPairsPerBody are accepted for interface compatibility but are not used here.
+// Every compilation result is now checked; previously the error code was never inspected
+// (and was left uninitialised if a helper did not write it).
+void initFindPairs(btFindPairsIO& fpio,cl_context cxMainContext, cl_device_id device, cl_command_queue commandQueue, int maxHandles, int maxPairsPerBody)
+{
+
+	//m_proxies.push_back( proxy );
+
+	fpio.m_mainContext = cxMainContext;
+	fpio.m_cqCommandQue = commandQueue;
+	fpio.m_device = device;
+	cl_int pErrNum = CL_SUCCESS;
+	cl_program prog = btOpenCLUtils::compileCLProgramFromString(cxMainContext, device, broadphaseKernelString, &pErrNum ,"",GRID_BROADPHASE_PATH);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+
+	fpio.m_broadphaseBruteForceKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "broadphaseKernel" ,&pErrNum,prog);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+	fpio.m_initializeGpuAabbsKernelSimple = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "initializeGpuAabbsSimple" ,&pErrNum,prog);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+	fpio.m_initializeGpuAabbsKernelFull = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "initializeGpuAabbsFull" ,&pErrNum,prog);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+
+	fpio.m_broadphaseColorKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "broadphaseColorKernel" ,&pErrNum,prog);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+
+	fpio.m_setupBodiesKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "setupBodiesKernel" ,&pErrNum,prog);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+	fpio.m_copyVelocitiesKernel = btOpenCLUtils::compileCLKernelFromString(cxMainContext,device, broadphaseKernelString, "copyVelocitiesKernel" ,&pErrNum,prog);
+	oclCHECKERROR(pErrNum, CL_SUCCESS);
+
+}
+
+// Enqueues the "broadphaseKernel" brute-force kernel with exactly one work-item per object.
+// Fixes over the previous version:
+//  - clSetKernelArg results are checked (they used to be overwritten unchecked)
+//  - nothing is enqueued for zero objects (a global size of 0 is rejected by OpenCL)
+//  - the fixed work-group size of 64 is only requested when it divides the object count;
+//    otherwise the runtime picks the local size, because OpenCL 1.x rejects a global size
+//    that is not a multiple of the requested local size
+void	findPairsOpenCLBruteForce(btFindPairsIO& fpio)
+{
+	int numObjects = fpio.m_numObjects;
+	int offset = fpio.m_positionOffset;
+
+	// CL_SUCCESS is 0, so OR-ing the status codes leaves a non-zero value if any call failed
+	int ciErrNum = 0;
+	ciErrNum |= clSetKernelArg(fpio.m_broadphaseBruteForceKernel, 0, sizeof(int), &offset);
+	ciErrNum |= clSetKernelArg(fpio.m_broadphaseBruteForceKernel, 1, sizeof(int), &numObjects);
+	ciErrNum |= clSetKernelArg(fpio.m_broadphaseBruteForceKernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	if (numObjects > 0)
+	{
+		size_t numWorkItems = numObjects;
+		size_t workGroupSize = 64;
+		const size_t* localSize = (numWorkItems % workGroupSize) ? 0 : &workGroupSize;
+
+		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, fpio.m_broadphaseBruteForceKernel, 1, NULL, &numWorkItems, localSize,0 ,0 ,0);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+}
+
+// Enqueues the "initializeGpuAabbsFull" kernel. Arguments: position offset, object count,
+// the shared objects buffer, the caller's bodies buffer, the local shape AABBs and the output AABBs.
+// The launch size is the object count rounded up to whole work-groups of 64.
+// clSetKernelArg results are now checked; they used to be overwritten without inspection.
+void	setupGpuAabbsFull(btFindPairsIO& fpio, cl_mem bodies)
+{
+	int numObjects = fpio.m_numObjects;
+	int offset = fpio.m_positionOffset;
+	cl_kernel kernel = fpio.m_initializeGpuAabbsKernelFull;
+
+	// CL_SUCCESS is 0, so OR-ing the status codes leaves a non-zero value if any call failed
+	int ciErrNum = 0;
+	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(int), &offset);
+	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(int), &numObjects);
+	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
+	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&bodies);
+	ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&fpio.m_dlocalShapeAABB);
+	ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&fpio.m_dAABB);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	size_t workGroupSize = 64;
+	size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
+
+	ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+// Enqueues the "initializeGpuAabbsSimple" kernel. Arguments: position offset, object count,
+// the shared objects buffer and the output AABB buffer.
+// The launch size is the object count rounded up to whole work-groups of 64.
+// clSetKernelArg results are now checked; they used to be overwritten without inspection.
+void	setupGpuAabbsSimple(btFindPairsIO& fpio)
+{
+	int numObjects = fpio.m_numObjects;
+	int offset = fpio.m_positionOffset;
+	cl_kernel kernel = fpio.m_initializeGpuAabbsKernelSimple;
+
+	// CL_SUCCESS is 0, so OR-ing the status codes leaves a non-zero value if any call failed
+	int ciErrNum = 0;
+	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(int), &offset);
+	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(int), &numObjects);
+	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
+	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&fpio.m_dAABB);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	size_t workGroupSize = 64;
+	size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
+
+	ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+
+// Enqueues "setupBodiesKernel" with the position offset, object count, shared objects buffer and
+// the caller's velocity, body and inertia buffers. Nothing is enqueued when there are no objects.
+// clSetKernelArg results are now checked; they used to be overwritten without inspection.
+void	setupBodies(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias)
+{
+	int numObjects = fpio.m_numObjects;
+	int offset = fpio.m_positionOffset;
+	cl_kernel kernel = fpio.m_setupBodiesKernel;
+
+	// CL_SUCCESS is 0, so OR-ing the status codes leaves a non-zero value if any call failed
+	int ciErrNum = 0;
+	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(int), &offset);
+	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(int), &fpio.m_numObjects);
+	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
+	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&linVelMem);
+	ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&angVelMem);
+	ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bodies);
+	ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bodyInertias);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	
+	if (numObjects)
+	{
+		// object count rounded up to whole work-groups
+		size_t workGroupSize = 64;
+		size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
+
+		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+
+}
+
+
+// Enqueues "copyVelocitiesKernel" with the same seven arguments as setupBodies: position offset,
+// object count, shared objects buffer and the caller's velocity, body and inertia buffers.
+// Nothing is enqueued when there are no objects.
+// clSetKernelArg results are now checked; they used to be overwritten without inspection.
+void	copyBodyVelocities(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias)
+{
+	int numObjects = fpio.m_numObjects;
+	int offset = fpio.m_positionOffset;
+	cl_kernel kernel = fpio.m_copyVelocitiesKernel;
+
+	// CL_SUCCESS is 0, so OR-ing the status codes leaves a non-zero value if any call failed
+	int ciErrNum = 0;
+	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(int), &offset);
+	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(int), &fpio.m_numObjects);
+	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
+	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&linVelMem);
+	ciErrNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&angVelMem);
+	ciErrNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&bodies);
+	ciErrNum |= clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&bodyInertias);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	
+	if (numObjects)
+	{
+		// object count rounded up to whole work-groups
+		size_t workGroupSize = 64;
+		size_t numWorkItems = workGroupSize*((numObjects+ (workGroupSize)) / workGroupSize);
+				
+		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+
+}
+
+// Enqueues "broadphaseColorKernel" over the overlapping pairs: one work-item per pair, rounded up
+// to whole work-groups of 64. Nothing is enqueued when fpio.m_numOverlap is zero.
+// clSetKernelArg results are now checked; they used to be overwritten without inspection.
+void	colorPairsOpenCL(btFindPairsIO&	fpio)
+{
+	int offset = fpio.m_positionOffset;
+	cl_kernel kernel = fpio.m_broadphaseColorKernel;
+
+	// CL_SUCCESS is 0, so OR-ing the status codes leaves a non-zero value if any call failed
+	int ciErrNum = 0;
+	ciErrNum |= clSetKernelArg(kernel, 0, sizeof(int), &offset);
+	ciErrNum |= clSetKernelArg(kernel, 1, sizeof(int), &fpio.m_numObjects);
+	ciErrNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&fpio.m_clObjectsBuffer);
+	ciErrNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&fpio.m_dAllOverlappingPairs);
+	ciErrNum |= clSetKernelArg(kernel, 4, sizeof(int), &fpio.m_numOverlap);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	if (fpio.m_numOverlap)
+	{
+		size_t workGroupSize = 64;
+		size_t numWorkItems = workGroupSize*((fpio.m_numOverlap+ (workGroupSize)) / workGroupSize);
+				
+		ciErrNum = clEnqueueNDRangeKernel(fpio.m_cqCommandQue, kernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	}
+}
+
+
+
+// Releases the six kernels created by initFindPairs, in the same order as before.
+void releaseFindPairs(btFindPairsIO& fpio)
+{
+	cl_kernel ownedKernels[] =
+	{
+		fpio.m_initializeGpuAabbsKernelSimple,
+		fpio.m_initializeGpuAabbsKernelFull,
+		fpio.m_broadphaseColorKernel,
+		fpio.m_broadphaseBruteForceKernel,
+		fpio.m_setupBodiesKernel,
+		fpio.m_copyVelocitiesKernel
+	};
+	const int numOwned = sizeof(ownedKernels)/sizeof(ownedKernels[0]);
+
+	for (int i=0;i<numOwned;i++)
+	{
+		clReleaseKernel(ownedKernels[i]);
+	}
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/findPairsOpenCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/findPairsOpenCL.h
@@ -0,0 +1,90 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Roman Ponomarev, Erwin Coumans
+
+#ifndef FIND_PAIRS_H
+#define FIND_PAIRS_H
+
+#include "../basic_initialize/btOpenCLInclude.h"
+
+// Bookkeeping record for one OpenCL kernel. Not referenced by the other declarations in this header.
+struct btKernelInfo
+{
+	int			m_Id;				// NOTE(review): numbering scheme not visible here - confirm with users of this struct
+	cl_kernel	m_kernel;			// the kernel object
+	char*		m_name;				// kernel name; ownership of the string is not defined here
+	int			m_workgroupSize;	// presumably the local work size to launch with - TODO confirm
+};
+
+
+
+// State shared by the find-pairs helper functions declared below: OpenCL handles, kernels and
+// device buffers. The struct has no constructor, so members hold whatever the caller assigns.
+struct btFindPairsIO
+{
+	int				m_numObjects;	// object count passed to the kernels and used to size launches
+
+	cl_mem			m_clObjectsBuffer; //for memory layout details see main.cpp (todo, make it flexible)
+	int				m_positionOffset;//offset in m_clObjectsBuffer where position array starts
+
+	// queue, context and device are stored by initFindPairs
+	cl_command_queue			m_cqCommandQue;
+	// the next six kernels are created by initFindPairs and released by releaseFindPairs
+	cl_kernel		m_initializeGpuAabbsKernelSimple;
+	cl_kernel		m_initializeGpuAabbsKernelFull;
+	cl_kernel	m_broadphaseColorKernel;
+	cl_kernel	m_broadphaseBruteForceKernel;
+
+	cl_kernel	m_setupBodiesKernel;
+	cl_kernel	m_copyVelocitiesKernel;
+
+	cl_context		m_mainContext;
+	cl_device_id	m_device;
+
+	// NOTE(review): these kernels are not assigned in findPairsOpenCL.cpp; presumably set elsewhere - confirm
+	cl_kernel		m_calcHashAabbKernel;
+	cl_kernel		m_clearCellStartKernel;
+	cl_kernel		m_findCellStartKernel;
+	cl_kernel		m_findOverlappingPairsKernel;
+	cl_kernel		m_computePairChangeKernel;
+	cl_kernel		m_squeezePairBuffKernel;
+
+
+	// pair buffer and pair count consumed by colorPairsOpenCL
+	cl_mem m_dAllOverlappingPairs;
+	int m_numOverlap;
+
+	// device buffers; of these the helpers in findPairsOpenCL.cpp use m_dlocalShapeAABB and m_dAABB
+	cl_mem					m_dBpParams;
+	cl_mem					m_dBodiesHash;
+	cl_mem					m_dCellStart;
+	cl_mem					m_dPairBuff; 
+	cl_mem					m_dPairBuffStartCurr;
+	cl_mem					m_dlocalShapeAABB;
+	cl_mem					m_dAABB;
+	cl_mem					m_dPairScan;
+	cl_mem					m_dPairOut;
+};
+
+
+void initFindPairs(btFindPairsIO& fpio,cl_context cxMainContext, cl_device_id device, cl_command_queue commandQueue, int maxHandles,int maxPairsPerBody = 16);
+
+void	findPairsOpenCLBruteForce(btFindPairsIO& fpio);
+
+void	setupGpuAabbsSimple(btFindPairsIO& fpio);
+
+void	setupGpuAabbsFull(btFindPairsIO& fpio, cl_mem bodies);
+
+
+void	colorPairsOpenCL(btFindPairsIO&	fpio);
+
+void	setupBodies(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias);
+void	copyBodyVelocities(btFindPairsIO& fpio, cl_mem linVelMem, cl_mem angVelMem, cl_mem bodies, cl_mem bodyInertias);
+
+void releaseFindPairs(btFindPairsIO& fpio);
+
+#endif //FIND_PAIRS_H
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/integrateKernel.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/integrateKernel.cl
@@ -0,0 +1,116 @@
+MSTRINGIFY(
+
+// Quaternion product q1*q2 with the vector part in xyz and the scalar part in w.
+float4 quatMult(float4 q1, float4 q2)
+{
+	float px = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
+	float py = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
+	float pz = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
+	float pw = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z;
+	return (float4)(px, py, pz, pw);
+}
+
+// Scales q to unit length; a quaternion of zero length becomes the identity (0 0 0 1).
+float4 quatNorm(float4 q)
+{
+	float len = native_sqrt(dot(q, q));
+	if(!(len > 0.f))
+	{
+		// also taken when len is NaN, exactly like the else branch of the plain comparison
+		return (float4)(0.f, 0.f, 0.f, 1.f);
+	}
+	return q * (1.f / len);
+}
+
+
+
+
+
+// Advances one body per work-item by a fixed time step of 0.0166666 s:
+//  - damps the xyz angular velocity by 0.99 and writes it back
+//  - rotates the orientation stored at g_vertexBuffer[id + startOffset/4 + numNodes]
+//    by the quaternion built from the damped angular velocity and renormalises it
+//  - moves the position stored at g_vertexBuffer[id + startOffset/4] by linVel * dt
+// pBodyTimes is not used by this kernel.
+__kernel void 
+  integrateTransformsKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
+		   __global float4 *linVel,
+		   __global float4 *pAngVel,
+		   __global float* pBodyTimes)
+{
+	int id = get_global_id(0);
+	if( id >= numNodes )
+	{
+		return;
+	}
+
+	float maxAngularMotion = (0.25f * 3.14159254f);
+	float dt = 0.0166666f;
+
+	int posSlot = id + startOffset/4;
+	int ornSlot = id + startOffset/4+numNodes;
+
+	//add some hardcoded angular damping (the w lane is left untouched)
+	float4 angvel = pAngVel[id];
+	angvel.x *= 0.99f;
+	angvel.y *= 0.99f;
+	angvel.z *= 0.99f;
+	pAngVel[id] = angvel;
+
+	//limit the angular motion per step
+	float speed = native_sqrt(dot(angvel, angvel));
+	if(speed*dt > maxAngularMotion)
+	{
+		speed = maxAngularMotion / dt;
+	}
+
+	float scale;
+	if(speed < 0.001f)
+	{
+		// Taylor expansion of sin(0.5*speed*dt)/speed for small speeds
+		scale = (0.5f*dt-(dt*dt*dt)*0.020833333333f * speed * speed);
+	}
+	else
+	{
+		scale = ( native_sin(0.5f * speed * dt) / speed);
+	}
+
+	// incremental rotation: vector part from the scaled velocity and scalar part from the half angle
+	float4 dorn = angvel * scale;
+	dorn.w = native_cos(speed * dt * 0.5f);
+
+	float4 orn0 = g_vertexBuffer[ornSlot];
+	g_vertexBuffer[ornSlot] = quatNorm(quatMult(dorn, orn0));
+
+	//linear velocity
+	g_vertexBuffer[posSlot] +=  linVel[id] * dt;
+}
+
+
+// Advances the per-body clock in pBodyTimes by a fixed increment and places the body on a curve
+// made of sums of cosines and sines of that clock. Only x y z of the position are rewritten.
+// linVel and pAngVel are not used by this kernel.
+__kernel void 
+  sineWaveKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
+		   __global float4 *linVel,
+		   __global float4 *pAngVel,
+		   __global float* pBodyTimes)
+{
+	int id = get_global_id(0);
+	float timeStepPos = 0.000166666;
+	float mAmplitude = 166.f;
+
+	if( id >= numNodes )
+	{
+		return;
+	}
+
+	pBodyTimes[id] += timeStepPos;
+	float t = pBodyTimes[id];
+
+	int slot = id + startOffset/4;
+	float4 position = g_vertexBuffer[slot];
+	position.x = native_cos(t*2.17f)*mAmplitude + native_sin(t)*mAmplitude*0.5f;
+	// the amplitude sits inside the sine argument on this axis exactly as in the source
+	position.y = native_cos(t*1.38f)*mAmplitude + native_sin(t*mAmplitude);
+	position.z = native_cos(t*2.17f)*mAmplitude + native_sin(t*0.777f)*mAmplitude;
+	g_vertexBuffer[slot] = position;
+}
+
+
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/main.cpp
--- a/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/broadphase_benchmark/premake4.lua
@@ -0,0 +1,5 @@
+
+	-- Pull in the build scripts of the AMD, Intel and NVIDIA subdirectories.
+	include "AMD"
+	include "Intel"
+	include "NVIDIA"
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/AMD/premake4.lua
@@ -0,0 +1,23 @@
+	
+	-- Defines the console sample "OpenCL_global_atomics_AMD"; skipped when findOpenCL_AMD() reports nothing.
+	hasCL = findOpenCL_AMD()
+	
+	if (hasCL) then
+
+		project "OpenCL_global_atomics_AMD"
+
+		-- findOpenCL_AMD / initOpenCL_AMD are helpers defined outside this script
+		initOpenCL_AMD()
+	
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+--		includedirs {"..","../../../../include/gpu_research"}
+		
+		-- the sample itself plus the shared OpenCL utility sources
+		files {
+			"../main.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/globalAtomicsKernel.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/globalAtomicsKernel.h
@@ -0,0 +1,36 @@
+/* Generated file: stringifykernels.bat runs stringify.py on global_atomics.cl to produce this
+   string. Edit the .cl file and regenerate instead of changing the text below by hand. */
+static const char* globalAtomicsKernelString= \
+"\n"
+"\n"
+"\n"
+"\n"
+"//OpenCL 1.1 has atomic_inc build-in (no extension needed)\n"
+"//see http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/atomic_inc.html\n"
+"__kernel void  globalAtomicKernelOpenCL1_1( volatile __global int* counter)\n"
+"{\n"
+"	atomic_inc(counter);\n"
+"}\n"
+"\n"
+"//OpenCL 1.1 atomic device counters extension, usually faster on current AMD hardware\n"
+"//http://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt\n"
+"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
+"__kernel void  counterAtomicKernelExt( counter32_t counter)\n"
+"{\n"
+"	atomic_inc(counter);\n"
+"}\n"
+"\n"
+"\n"
+"//OpenCL 1.0 optional extension, using atom_inc\n"
+"//see http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/cl_khr_global_int32_base_atomics.html\n"
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable //atomic_inc\n"
+"__kernel void  globalAtomicKernelExt( __global int* counter)\n"
+"{\n"
+"	atom_inc(counter);\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void  globalAtomicKernelCounters32Broken( __global int* counter)\n"
+"{\n"
+"	(*counter)++;\n"
+"}\n"
+"\n"
+;
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/global_atomics.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/global_atomics.cl
@@ -0,0 +1,34 @@
+
+
+
+
+//OpenCL 1.1 has atomic_inc built in (no extension needed)
+//see http://www.khronos.org/registry/cl/sdk/1.1/docs/man/xhtml/atomic_inc.html
+//Every work-item atomically adds one to the single counter in global memory.
+__kernel void  globalAtomicKernelOpenCL1_1( volatile __global int* counter)
+{
+	atomic_inc(counter);
+}
+
+//OpenCL 1.1 atomic device counters extension, usually faster on current AMD hardware
+//http://www.khronos.org/registry/cl/extensions/ext/cl_ext_atomic_counters_32.txt
+//Every work-item increments the counter32_t device counter once.
+//The host program replaces counter32_t with a plain global int pointer when the extension is missing.
+#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
+__kernel void  counterAtomicKernelExt( counter32_t counter)
+{
+	atomic_inc(counter);
+}
+
+
+//OpenCL 1.0 optional extension, using atom_inc
+//see http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/cl_khr_global_int32_base_atomics.html
+//Every work-item atomically adds one to the counter through the extension function.
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable //atom_inc
+__kernel void  globalAtomicKernelExt( __global int* counter)
+{
+	atom_inc(counter);
+}
+
+
+//Deliberately non-atomic increment: work-items can overwrite each other's update, so the final
+//count can fall short. The host program reports this kernel as broken when that happens.
+__kernel void  globalAtomicKernelCounters32Broken( __global int* counter)
+{
+	(*counter)++;
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/main.cpp
@@ -0,0 +1,201 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "../basic_initialize/btOpenCLUtils.h"
+#include <stdio.h>
+
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+cl_kernel			g_atomicsKernel;
+static const size_t workGroupSize = 128;//todo figure out an appropriate workgroup size suitable for the OpenCL platform/context/device/kernel
+#define NUM_OBJECTS 1024
+
+#include "globalAtomicsKernel.h"
+
+
+char * findAndReplace(   char const * const original,     char const * const pattern,     char const * const replacement);
+
+
+#include <string.h>
+#include <stdlib.h>
+#include <malloc.h>
+
+
+// Atomic-increment self test. Creates a GPU context, then for each of the four kernels in
+// globalAtomicsKernelString: zeroes a one-int device counter, runs NUM_OBJECTS work-items that
+// each increment it once, reads the counter back and reports whether it equals NUM_OBJECTS.
+//
+// Resource handling fixed relative to the first version:
+//  - the patched kernel source returned by findAndReplace is NULL-checked and freed
+//  - the kernel created in every iteration and the counter buffer are released
+//  - the results of clFinish and clReleaseCommandQueue are captured, so the checks that
+//    follow them test the right status code
+int main(int argc, char* argv[])
+{
+	int ciErrNum = 0;
+	
+	printf("press a key to start\n");
+	getchar();
+
+	const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
+	printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
+
+	cl_device_type  deviceType = CL_DEVICE_TYPE_GPU;//CL_DEVICE_TYPE_ALL
+	
+	void* glCtx=0;
+	void* glDC = 0;
+	printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
+	g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
+
+	if (numDev>0)
+	{
+		int deviceIndex=0;
+
+		cl_device_id		device;
+		device = btOpenCLUtils::getDevice(g_cxMainContext,deviceIndex);
+		btOpenCLDeviceInfo clInfo;
+		btOpenCLUtils::getDeviceInfo(device,clInfo);
+		btOpenCLUtils::printDeviceInfo(device);
+
+
+		// Without the counters extension the counter32_t type does not exist, so the kernel
+		// source is rewritten to use a plain global int pointer instead.
+		const char* globalAtomicsKernelStringPatched = globalAtomicsKernelString;
+		char* patchedSource = 0;	// heap copy owned by this function, freed below
+		if (!strstr(clInfo.m_deviceExtensions,"cl_ext_atomic_counters_32"))
+		{
+			patchedSource = findAndReplace(globalAtomicsKernelString,"counter32_t", "volatile __global int*");
+			if (patchedSource)
+			{
+				globalAtomicsKernelStringPatched = patchedSource;
+			} else
+			{
+				printf("out of memory while patching the kernel source, using the unpatched source\n");
+			}
+		}
+
+		
+
+		// create a command-queue
+		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+		
+		cl_mem counterBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, sizeof(int), NULL, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+		char* kernelMethods[] = 
+		{
+			"globalAtomicKernelOpenCL1_1",
+			"counterAtomicKernelExt",
+			"globalAtomicKernelExt",
+			"globalAtomicKernelCounters32Broken"
+		};
+		int numKernelMethods = sizeof(kernelMethods)/sizeof(char*);
+
+		for (int i=0;i<numKernelMethods;i++)
+		{
+			int myCounter = 0;
+
+			//write to counterBuffer
+			int deviceOffset=0;
+
+			// non-blocking write: myCounter stays alive and unmodified until clFinish below
+			ciErrNum = clEnqueueWriteBuffer(g_cqCommandQue, counterBuffer,CL_FALSE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			g_atomicsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext,device,globalAtomicsKernelStringPatched,kernelMethods[i], &ciErrNum);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			ciErrNum = clSetKernelArg(g_atomicsKernel, 0, sizeof(cl_mem),(void*)&counterBuffer);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			// NUM_OBJECTS rounded up to whole work-groups
+			size_t	numWorkItems = workGroupSize*((NUM_OBJECTS + (workGroupSize-1)) / workGroupSize);
+			ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_atomicsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			
+			ciErrNum = clFinish(g_cqCommandQue);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			//read from counterBuffer
+			ciErrNum = clEnqueueReadBuffer(g_cqCommandQue, counterBuffer, CL_TRUE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+			if (myCounter != NUM_OBJECTS)
+			{
+				printf("%s is broken, expected %d got %d\n",kernelMethods[i],NUM_OBJECTS,myCounter);
+			} else
+			{
+				printf("%s success, got %d\n",kernelMethods[i],myCounter);
+			}
+
+			// a fresh kernel is compiled in every iteration, so release this one now
+			ciErrNum = clReleaseKernel(g_atomicsKernel);
+			oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			g_atomicsKernel = 0;
+		}
+
+		ciErrNum = clReleaseMemObject(counterBuffer);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+		ciErrNum = clReleaseCommandQueue(g_cqCommandQue);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+		// free(NULL) is a no-op, so this is safe when no patching took place
+		free(patchedSource);
+	}
+
+	clReleaseContext(g_cxMainContext);
+	
+	printf("press a key to end\n");
+	getchar();
+
+	return 0;
+}
+
+
+#ifdef _WIN32
+#pragma warning( push )
+#pragma warning( disable : 4996 )
+#endif //_WIN32
+
+#include <string.h>
+#include <stdlib.h>
+
+// Returns a newly malloc'ed copy of original in which every non-overlapping occurrence of
+// pattern (scanned left to right) is replaced by replacement.
+// The caller owns the result and must free() it. Returns NULL when the allocation fails.
+// An empty pattern is treated as "nothing to replace" and yields a plain copy; it used to
+// make the scanning loops run forever because an empty match never advances.
+char * findAndReplace(
+    char const * const original, 
+    char const * const pattern, 
+    char const * const replacement
+) {
+  size_t const replen = strlen(replacement);
+  size_t const patlen = strlen(pattern);
+  size_t const orilen = strlen(original);
+
+  size_t patcnt = 0;
+  const char * oriptr = original;
+  const char * patloc = NULL;
+
+  // find how many times the pattern occurs in the original string
+  if (patlen > 0)
+  {
+    while ((patloc = strstr(oriptr, pattern)) != NULL)
+    {
+      patcnt++;
+      oriptr = patloc + patlen;
+    }
+  }
+
+  {
+    // allocate memory for the new string; subtract before adding so the
+    // intermediate value never wraps when the replacement is shorter
+    size_t const retlen = orilen - patcnt * patlen + patcnt * replen;
+    char * const returned = (char *) malloc( sizeof(char) * (retlen + 1) );
+
+    if (returned != NULL)
+    {
+      // copy the original string, 
+      // replacing all the instances of the pattern
+      char * retptr = returned;
+      oriptr = original;
+      while (patcnt > 0 && (patloc = strstr(oriptr, pattern)) != NULL)
+      {
+        size_t const skplen = (size_t)(patloc - oriptr);
+        // copy the section until the occurrence of the pattern
+        memcpy(retptr, oriptr, skplen);
+        retptr += skplen;
+        // copy the replacement 
+        memcpy(retptr, replacement, replen);
+        retptr += replen;
+        oriptr = patloc + patlen;
+      }
+      // copy the rest of the string including its terminator
+      memcpy(retptr, oriptr, strlen(oriptr) + 1);
+    }
+    return returned;
+  }
+}
+
+#ifdef _WIN32
+#pragma warning( pop )
+#endif //_WIN32
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/premake4.lua
@@ -0,0 +1,4 @@
+
+	-- Only the AMD build script is pulled in; the Intel and NVIDIA lines are commented out.
+	include "AMD"
+	--include "Intel"
+	--include "NVIDIA"
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/stringify.py
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/stringify.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+import sys
+import os
+import shutil
+
+arg = sys.argv[1]
+fh = open(arg)
+	
+print 'static const char* '+sys.argv[2]+'= \\'
+for line in fh.readlines():
+	a = line.strip('\n')
+	print '"'+a+'\\n"'
+print ';'
--- a/Extras/RigidBodyGpuPipeline/opencl/global_atomics/stringifykernels.bat
+++ b/Extras/RigidBodyGpuPipeline/opencl/global_atomics/stringifykernels.bat
@@ -0,0 +1,5 @@
+rem Regenerates globalAtomicsKernel.h: global_atomics.cl becomes the C string globalAtomicsKernelString.
+stringify.py global_atomics.cl globalAtomicsKernelString >globalAtomicsKernel.h
+
+
+
+
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/AMD/premake4.lua
@@ -0,0 +1,58 @@
+	
+	-- Defines the console demo "OpenCL_gpu_rigidbody_pipeline_AMD"; skipped when findOpenCL_AMD() reports nothing.
+	hasCL = findOpenCL_AMD()
+	
+	if (hasCL) then
+
+		project "OpenCL_gpu_rigidbody_pipeline_AMD"
+
+		-- the find/init helpers used below are defined outside this script
+		initOpenCL_AMD()
+	
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+
+			includedirs {
+		"../../../rendering/BulletMath",
+		"../../primitives",
+		"../../../../../src"
+		}
+		
+		-- The demo compiles the sources it needs directly (broadphase helpers, a few Bullet
+		-- LinearMath and BroadphaseCollision files, OpenCL utilities, GL interop).
+		files {
+			"../main.cpp",
+			"../btConvexUtility.cpp",
+			"../btConvexUtility.h",
+			"../btGpuNarrowPhaseAndSolver.cpp",
+			"../btGpuNarrowPhaseAndSolver.h",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
+			"../../../../../src/LinearMath/btConvexHullComputer.cpp",
+			"../../../../../src/LinearMath/btConvexHullComputer.h",
+			"../../broadphase_benchmark/findPairsOpenCL.cpp",
+			"../../broadphase_benchmark/findPairsOpenCL.h",
+			"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
+			"../../broadphase_benchmark/btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/CommandLineArgs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/CommandLineArgs.h
@@ -0,0 +1,91 @@
+#ifndef COMMAND_LINE_ARGS_H
+#define COMMAND_LINE_ARGS_H
+
+/******************************************************************************
+ * Command-line parsing
+ ******************************************************************************/
+#include <map>
+#include <algorithm>
+#include <string>
+#include <sstream>
+/**
+ * Minimal parser for "--key" and "--key=value" command-line options.
+ *
+ * Arguments that do not start with "--" are ignored. Every accepted option is
+ * stored in a key -> value map, keyed by the text between the "--" prefix and
+ * the first '=' (or the end of the argument). Options without '=' are stored
+ * with an empty value; a repeated key overwrites the earlier entry.
+ */
+class CommandLineArgs
+{
+protected:
+
+	// Parsed options, keyed by option name without the leading "--".
+	std::map<std::string, std::string> pairs;
+
+public:
+
+	/**
+	 * Parses argv[1] .. argv[argc-1]; argv[0] is skipped.
+	 */
+	CommandLineArgs(int argc, char **argv)
+	{
+		for (int i = 1; i < argc; i++)
+		{
+			const std::string arg = argv[i];
+
+			// Require the "--" prefix. compare() is length-safe, so empty and
+			// one-character arguments are rejected without indexing past the end.
+			if (arg.compare(0, 2, "--") != 0) {
+				continue;
+			}
+
+			const std::string::size_type pos = arg.find('=');
+			if (pos == std::string::npos) {
+				// Bare flag: "--name"
+				pairs[arg.substr(2)] = "";
+			} else {
+				// "--name=value": everything after the first '=' is the value.
+				pairs[arg.substr(2, pos - 2)] = arg.substr(pos + 1);
+			}
+		}
+	}
+
+	/**
+	 * Returns true when an option called arg_name (without "--") was parsed,
+	 * regardless of whether it carried a value.
+	 */
+	bool CheckCmdLineFlag(const char* arg_name) const
+	{
+		return pairs.find(arg_name) != pairs.end();
+	}
+
+	/**
+	 * Reads the value of option arg_name into val by streaming it through an
+	 * istringstream. val is left untouched when the option is absent.
+	 */
+	template <typename T>
+	void GetCmdLineArgument(const char *arg_name, T &val);
+
+	/**
+	 * Number of distinct options that were parsed.
+	 */
+	int ParsedArgc() const
+	{
+		return static_cast<int>(pairs.size());
+	}
+};
+
+// Generic lookup: if option arg_name was parsed, stream its stored text into
+// val using operator>>. When the option is absent, val is not modified.
+template <typename T>
+void CommandLineArgs::GetCmdLineArgument(const char *arg_name, T &val)
+{
+	std::map<std::string, std::string>::iterator found = pairs.find(arg_name);
+	if (found == pairs.end())
+		return;
+
+	std::istringstream parser(found->second);
+	parser >> val;
+}
+
+// Specialization for C strings: hands back a malloc()-allocated copy of the
+// option's value, or NULL when the option is absent (or the allocation fails).
+// The caller owns the returned buffer and must release it with free().
+//
+// Marked inline because an explicit specialization is an ordinary function
+// definition: without it, including this header from more than one translation
+// unit produces duplicate-symbol link errors.
+template <>
+inline void CommandLineArgs::GetCmdLineArgument<char*>(const char* arg_name, char* &val)
+{
+	using namespace std;
+	map<string, string>::iterator itr = pairs.find(arg_name);
+	if (itr == pairs.end()) {
+		val = NULL;
+		return;
+	}
+
+	const string& s = itr->second;
+	val = (char*) malloc(sizeof(char) * (s.length() + 1));
+	if (val != NULL) {
+		// The buffer holds length()+1 bytes, so the terminator fits.
+		strcpy(val, s.c_str());
+	}
+}
+
+#endif //COMMAND_LINE_ARGS_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/Intel/premake4.lua
@@ -0,0 +1,58 @@
+	
+	-- Premake4 project for the GPU rigid body pipeline demo, Intel OpenCL flavour.
+	-- The project is only declared when findOpenCL_Intel() returns a truthy value.
+	hasCL = findOpenCL_Intel()
+
+	if hasCL then
+
+		project("OpenCL_gpu_rigidbody_pipeline_Intel")
+
+		initOpenCL_Intel()
+
+		language("C++")
+
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+		-- Header search paths, relative to this script.
+		includedirs({
+			"../../../rendering/BulletMath",
+			"../../primitives",
+			"../../../../../src"
+		})
+
+		-- Sources are listed explicitly: the demo itself plus the pieces it
+		-- pulls in from sibling demos and from the Bullet src tree.
+		files({
+			"../main.cpp",
+			"../btConvexUtility.cpp",
+			"../btConvexUtility.h",
+			"../btGpuNarrowPhaseAndSolver.cpp",
+			"../btGpuNarrowPhaseAndSolver.h",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
+			"../../../../../src/LinearMath/btConvexHullComputer.cpp",
+			"../../../../../src/LinearMath/btConvexHullComputer.h",
+			"../../broadphase_benchmark/findPairsOpenCL.cpp",
+			"../../broadphase_benchmark/findPairsOpenCL.h",
+			"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
+			"../../broadphase_benchmark/btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		})
+
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/NVIDIA/premake4.lua
@@ -0,0 +1,57 @@
+	
+	-- Premake4 project for the GPU rigid body pipeline demo, NVIDIA OpenCL flavour.
+	-- The project is only declared when findOpenCL_NVIDIA() returns a truthy value.
+	hasCL = findOpenCL_NVIDIA()
+
+	if hasCL then
+
+		project("OpenCL_gpu_rigidbody_pipeline_NVIDIA")
+
+		initOpenCL_NVIDIA()
+
+		language("C++")
+
+		kind("ConsoleApp")
+		targetdir("../../../bin")
+
+		initOpenGL()
+		initGlut()
+		initGlew()
+
+		-- Header search paths, relative to this script.
+		includedirs({
+			"../../../rendering/BulletMath",
+			"../../primitives",
+			"../../../../../src"
+		})
+
+		-- Sources are listed explicitly: the demo itself plus the pieces it
+		-- pulls in from sibling demos and from the Bullet src tree.
+		files({
+			"../main.cpp",
+			"../btConvexUtility.cpp",
+			"../btConvexUtility.h",
+			"../btGpuNarrowPhaseAndSolver.cpp",
+			"../btGpuNarrowPhaseAndSolver.h",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
+			"../../../../../src/LinearMath/btConvexHullComputer.cpp",
+			"../../../../../src/LinearMath/btConvexHullComputer.h",
+			"../../broadphase_benchmark/findPairsOpenCL.cpp",
+			"../../broadphase_benchmark/findPairsOpenCL.h",
+			"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
+			"../../broadphase_benchmark/btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		})
+
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btConvexUtility.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btConvexUtility.cpp
@@ -0,0 +1,240 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+#include "btConvexUtility.h"
+#include "LinearMath/btConvexHullComputer.h"
+#include "LinearMath/btGrahamScan2dConvexHull.h"
+#include "LinearMath/btQuaternion.h"
+
+/*
+Builds m_vertices and m_faces from the convex hull of orgVertices.
+
+1. btConvexHullComputer computes the hull; its vertices are copied to m_vertices.
+2. For every hull face the vertex loop is walked, the indices are recorded and a
+   plane (unit normal + offset) is derived from the first two edge directions.
+3. Faces whose normals agree (dot product above faceWeldThreshold) are grouped.
+   When mergeCoplanarTriangles is set, each group of more than one face is replaced
+   by a single polygon obtained from a 2D Graham scan, unless that would drop a
+   vertex which a face outside the group still references. Otherwise the faces of
+   the group are emitted unchanged.
+
+Returns false when orgVertices is empty (m_vertices and m_faces are then cleared),
+true otherwise.
+*/
+bool	btConvexUtility::initializePolyhedralFeatures(const btAlignedObjectArray<btVector3>& orgVertices, bool mergeCoplanarTriangles)
+{
+	// Face indices refer to m_vertices, which is rebuilt below, so faces left over
+	// from an earlier call must not survive.
+	m_faces.clear();
+
+	// An empty point set has no hull; bail out before orgVertices[0] is touched.
+	if (orgVertices.size()==0)
+	{
+		m_vertices.clear();
+		return false;
+	}
+
+	btConvexHullComputer conv;
+	conv.compute(&orgVertices[0].getX(), sizeof(btVector3),orgVertices.size(),0.f,0.f);
+
+	btAlignedObjectArray<btVector3> faceNormals;
+	int numFaces = conv.faces.size();
+	faceNormals.resize(numFaces);
+	btConvexHullComputer* convexUtil = &conv;
+
+	btAlignedObjectArray<btFace>	tmpFaces;
+	tmpFaces.resize(numFaces);
+
+	int numVertices = convexUtil->vertices.size();
+	m_vertices.resize(numVertices);
+	for (int p=0;p<numVertices;p++)
+	{
+		m_vertices[p] = convexUtil->vertices[p];
+	}
+
+	for (int i=0;i<numFaces;i++)
+	{
+		int face = convexUtil->faces[i];
+		const btConvexHullComputer::Edge*  firstEdge = &convexUtil->edges[face];
+		const btConvexHullComputer::Edge*  edge = firstEdge;
+
+		// Only the first two edge directions are needed for the face normal.
+		btVector3 edges[2];
+		int numEdges = 0;
+
+		// Walk the edge loop of this face, collecting its vertex indices.
+		do
+		{
+			int src = edge->getSourceVertex();
+			tmpFaces[i].m_indices.push_back(src);
+			int targ = edge->getTargetVertex();
+			btVector3 wa = convexUtil->vertices[src];
+			btVector3 wb = convexUtil->vertices[targ];
+			btVector3 newEdge = wb-wa;
+			newEdge.normalize();
+			if (numEdges<2)
+				edges[numEdges++] = newEdge;
+
+			edge = edge->getNextEdgeOfFace();
+		} while (edge!=firstEdge);
+
+		if (numEdges==2)
+		{
+			faceNormals[i] = edges[0].cross(edges[1]);
+			faceNormals[i].normalize();
+		}
+		else
+		{
+			btAssert(0);//degenerate?
+			faceNormals[i].setZero();
+		}
+
+		// Written in both branches so that the merge pass below never reads an
+		// uninitialized plane normal for a degenerate face.
+		tmpFaces[i].m_plane[0] = faceNormals[i].getX();
+		tmpFaces[i].m_plane[1] = faceNormals[i].getY();
+		tmpFaces[i].m_plane[2] = faceNormals[i].getZ();
+
+		// Plane offset: negated minimum projection of the face vertices onto the normal.
+		btScalar planeEq = 1e30f;
+		for (int v=0;v<tmpFaces[i].m_indices.size();v++)
+		{
+			btScalar eq = m_vertices[tmpFaces[i].m_indices[v]].dot(faceNormals[i]);
+			if (planeEq>eq)
+			{
+				planeEq=eq;
+			}
+		}
+		tmpFaces[i].m_plane[3] = -planeEq;
+	}
+
+	//merge coplanar faces
+
+	btScalar faceWeldThreshold= 0.999f;
+	btAlignedObjectArray<int> todoFaces;
+	for (int i=0;i<tmpFaces.size();i++)
+		todoFaces.push_back(i);
+
+	while (todoFaces.size())
+	{
+		// Take the last pending face as reference and gather every other pending
+		// face whose normal is (nearly) identical.
+		btAlignedObjectArray<int> coplanarFaceGroup;
+		int refFace = todoFaces[todoFaces.size()-1];
+
+		coplanarFaceGroup.push_back(refFace);
+		btFace& faceA = tmpFaces[refFace];
+		todoFaces.pop_back();
+
+		btVector3 faceNormalA(faceA.m_plane[0],faceA.m_plane[1],faceA.m_plane[2]);
+		for (int j=todoFaces.size()-1;j>=0;j--)
+		{
+			int candidate = todoFaces[j];
+			btFace& faceB = tmpFaces[candidate];
+			btVector3 faceNormalB(faceB.m_plane[0],faceB.m_plane[1],faceB.m_plane[2]);
+			if (faceNormalA.dot(faceNormalB)>faceWeldThreshold)
+			{
+				coplanarFaceGroup.push_back(candidate);
+				todoFaces.remove(candidate);
+			}
+		}
+
+		bool did_merge = false;
+		if (mergeCoplanarTriangles && coplanarFaceGroup.size()>1)
+		{
+			//do the merge: use Graham Scan 2d convex hull
+
+			btAlignedObjectArray<GrahamVector2> orgpoints;
+
+			for (int g=0;g<coplanarFaceGroup.size();g++)
+			{
+				btFace& face = tmpFaces[coplanarFaceGroup[g]];
+				btVector3 faceNormal(face.m_plane[0],face.m_plane[1],face.m_plane[2]);
+				btVector3 xyPlaneNormal(0,0,1);
+
+				// Rotate the face into the XY plane so the scan can work in 2D.
+				btQuaternion rotationArc = shortestArcQuat(faceNormal,xyPlaneNormal);
+
+				for (int f=0;f<face.m_indices.size();f++)
+				{
+					int orgIndex = face.m_indices[f];
+					btVector3 pt = m_vertices[orgIndex];
+					btVector3 rotatedPt =  quatRotate(rotationArc,pt);
+					rotatedPt.setZ(0);
+
+					// Each hull vertex goes into the point set only once.
+					bool found = false;
+					for (int q=0;q<orgpoints.size();q++)
+					{
+						if (orgpoints[q].m_orgIndex == orgIndex)
+						{
+							found=true;
+							break;
+						}
+					}
+					if (!found)
+						orgpoints.push_back(GrahamVector2(rotatedPt,orgIndex));
+				}
+			}
+
+			// The merged polygon inherits the plane of the group's reference face.
+			btFace combinedFace;
+			for (int c=0;c<4;c++)
+				combinedFace.m_plane[c] = tmpFaces[coplanarFaceGroup[0]].m_plane[c];
+
+			btAlignedObjectArray<GrahamVector2> hull;
+			GrahamScanConvexHull2D(orgpoints,hull);
+
+			for (int h=0;h<hull.size();h++)
+			{
+				combinedFace.m_indices.push_back(hull[h].m_orgIndex);
+				// Mark the point as consumed by the hull.
+				for(int k = 0; k < orgpoints.size(); k++) {
+					if(orgpoints[k].m_orgIndex == hull[h].m_orgIndex) {
+						orgpoints[k].m_orgIndex = -1; // invalidate...
+						break;
+					}
+				}
+			}
+
+			// are there rejected vertices?
+			bool reject_merge = false;
+			for(int r = 0; r < orgpoints.size(); r++) {
+				if(orgpoints[r].m_orgIndex == -1)
+					continue; // this is in the hull...
+				// this vertex is rejected -- is anybody else using this vertex?
+				for(int j = 0; j < tmpFaces.size(); j++) {
+					btFace& face = tmpFaces[j];
+					// is this a face of the current coplanar group?
+					bool is_in_current_group = false;
+					for(int k = 0; k < coplanarFaceGroup.size(); k++) {
+						if(coplanarFaceGroup[k] == j) {
+							is_in_current_group = true;
+							break;
+						}
+					}
+					if(is_in_current_group) // ignore this face...
+						continue;
+					// does this face use this rejected vertex?
+					for(int v = 0; v < face.m_indices.size(); v++) {
+						if(face.m_indices[v] == orgpoints[r].m_orgIndex) {
+							// this rejected vertex is used in another face -- reject merge
+							reject_merge = true;
+							break;
+						}
+					}
+					if(reject_merge)
+						break;
+				}
+				if(reject_merge)
+					break;
+			}
+			if(!reject_merge) {
+				// do this merge!
+				did_merge = true;
+				m_faces.push_back(combinedFace);
+			}
+		}
+		if(!did_merge)
+		{
+			// Either merging is disabled, the group has one face, or the merge was
+			// rejected: keep the original faces.
+			for (int g=0;g<coplanarFaceGroup.size();g++)
+			{
+				m_faces.push_back(tmpFaces[coplanarFaceGroup[g]]);
+			}
+		}
+
+	}
+	return true;
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btConvexUtility.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btConvexUtility.h
@@ -0,0 +1,41 @@
+
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef _BT_CONVEX_UTILITY_H
+#define _BT_CONVEX_UTILITY_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+#include "LinearMath/btVector3.h"
+
+// One polygonal face of a convex polyhedron.
+struct btFace
+{
+	// Indices of the face's vertices (into btConvexUtility::m_vertices), in loop order.
+	btAlignedObjectArray<int>	m_indices;
+
+	// Face plane: [0..2] hold the normal, [3] the plane offset.
+	btScalar	m_plane[4];
+};
+
+// Holds the vertices and faces of a convex polyhedron derived from a point cloud.
+class btConvexUtility
+{
+public:
+
+	// Hull vertices, filled by initializePolyhedralFeatures().
+	btAlignedObjectArray<btVector3>	m_vertices;
+
+	// Hull faces, filled by initializePolyhedralFeatures().
+	btAlignedObjectArray<btFace>	m_faces;
+
+	// Computes the convex hull of orgVertices and fills m_vertices / m_faces.
+	// When mergeCoplanarTriangles is true, faces with matching normals are
+	// combined into a single polygon where possible.
+	bool	initializePolyhedralFeatures(const btAlignedObjectArray<btVector3>& orgVertices, bool mergeCoplanarTriangles);
+
+};
+#endif
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.cpp
@@ -0,0 +1,730 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "btGpuNarrowphaseAndSolver.h"
+
+//#include "CustomConvexShape.h"
+//#include "CustomConvexPairCollision.h"
+#include "LinearMath/btQuickprof.h"
+
+
+//#include "BulletDynamics/Dynamics/btRigidBody.h"
+
+#include "Adl/Adl.h"
+#include "../../dynamics/basic_demo/Stubs/AdlMath.h"
+#include "../../dynamics/basic_demo/Stubs/AdlContact4.h"
+#include "../../dynamics/basic_demo/Stubs/AdlQuaternion.h"
+#include "../../dynamics/basic_demo/Stubs/ChNarrowPhase.h"
+#include "../../dynamics/basic_demo/Stubs/Solver.h"
+#include <AdlPrimitives/Sort/RadixSort32.h>
+
+// Global switch with external linkage; it is not referenced in the part of this
+// file visible here. NOTE(review): presumably selects GPU contact batching — confirm.
+int gpuBatchContacts = 1;
+
+// Result of the most recent ChNarrowphase culling pass; written by
+// computeContactsAndSolver() and passed on as the convex-convex pair count.
+int numPairsOut =0;
+// Host-side view of a solver's m_parallelSolveData: two arrays with one entry
+// per cell of an N_SPLIT x N_SPLIT grid.
+struct CPUSolveData
+{
+	enum { NUM_CELLS = adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT };
+
+	u32 m_n[NUM_CELLS];
+	u32 m_offset[NUM_CELLS];
+};
+
+
+// Device-side view of a solver's m_parallelSolveData: the same two arrays as
+// CPUSolveData, held as adl buffers.
+struct ParallelSolveData
+{
+	adl::Buffer<u32> *m_numConstraints;	// receives CPUSolveData::m_n on upload
+	adl::Buffer<u32> *m_offsets;		// receives CPUSolveData::m_offset on upload
+};
+
+// All state owned by btGpuNarrowphaseAndSolver, kept behind m_internalData.
+// The struct is zero-filled with memset right after allocation, so the member
+// order and types are kept exactly as they were.
+struct	CustomDispatchData
+{
+	// --- devices ---
+	adl::DeviceCL *m_deviceCL;		// supplied by the caller; not released by the destructor
+	adl::Device *m_deviceHost;		// host device, allocated and released by this class
+
+	// --- shapes ---
+	ShapeDataType m_ShapeBuffer;		// device shape storage (ChNarrowphase::allocateShapeBuffer)
+	adl::HostBuffer<ConvexHeightField*> *m_shapePointers;	// host table of registered shapes
+
+	// --- collision pairs ---
+	adl::HostBuffer<int2> *m_pBufPairsCPU;
+
+	adl::Buffer<int2> *m_convexPairsOutGPU;	// output of the culling pass
+	adl::Buffer<int2> *m_planePairs;		// (plane body, other body) pairs
+
+	// --- contacts ---
+	adl::Buffer<Contact4> *m_pBufContactOutGPU;
+	adl::HostBuffer<Contact4> *m_pBufContactOutCPU;
+	adl::ChNarrowphase<adl::TYPE_CL>::Data *m_Data;
+
+	// --- rigid bodies ---
+	adl::HostBuffer<RigidBodyBase::Body> *m_bodyBufferCPU;
+	adl::Buffer<RigidBodyBase::Body> *m_bodyBufferGPU;
+
+	adl::Buffer<RigidBodyBase::Inertia> *m_inertiaBufferCPU;	// adl::Buffer created on the host device
+	adl::Buffer<RigidBodyBase::Inertia> *m_inertiaBufferGPU;
+
+	// --- solver ---
+	adl::Solver<adl::TYPE_CL>::Data *m_solverDataGPU;
+	SolverData		m_contactCGPU;
+	void *m_frictionCGPU;
+
+	// --- counters ---
+	int m_numAcceleratedShapes;		// next free slot in the shape buffers
+	int m_numAcceleratedRigidBodies;	// next free slot in the body/inertia buffers
+};
+
+
+// Allocates every host and device buffer used by the narrowphase and the solver.
+// When deviceCL is null nothing is allocated and m_internalData stays null.
+// Buffers are sized with the MAX_BROADPHASE_COLLISION_CL, MAX_CONVEX_BODIES_CL
+// and MAX_CONVEX_SHAPES_CL constants; the allocation order matches the original.
+btGpuNarrowphaseAndSolver::btGpuNarrowphaseAndSolver(adl::DeviceCL* deviceCL)
+	:m_internalData(0) ,m_planeBodyIndex(-1)
+{
+	if (!deviceCL)
+		return;
+
+	m_internalData = new CustomDispatchData();
+	memset(m_internalData,0,sizeof(CustomDispatchData));
+
+	CustomDispatchData* d = m_internalData;
+
+	adl::DeviceUtils::Config cfg;
+	d->m_deviceCL = deviceCL;
+	d->m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );
+
+	// Collision pair storage.
+	d->m_pBufPairsCPU = new adl::HostBuffer<int2>(d->m_deviceHost, MAX_BROADPHASE_COLLISION_CL);
+	d->m_convexPairsOutGPU = new adl::Buffer<int2>(d->m_deviceCL,MAX_BROADPHASE_COLLISION_CL);
+	d->m_planePairs = new adl::Buffer<int2>(d->m_deviceCL,MAX_BROADPHASE_COLLISION_CL);
+
+	// Contacts, bodies and inertias (host and device copies).
+	d->m_pBufContactOutCPU = new adl::HostBuffer<Contact4>(d->m_deviceHost, MAX_BROADPHASE_COLLISION_CL);
+	d->m_bodyBufferCPU = new adl::HostBuffer<RigidBodyBase::Body>(d->m_deviceHost, MAX_CONVEX_BODIES_CL);
+	d->m_inertiaBufferCPU = new adl::Buffer<RigidBodyBase::Inertia>(d->m_deviceHost,MAX_CONVEX_BODIES_CL);
+	d->m_pBufContactOutGPU = new adl::Buffer<Contact4>(d->m_deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	d->m_inertiaBufferGPU = new adl::Buffer<RigidBodyBase::Inertia>(d->m_deviceCL,MAX_CONVEX_BODIES_CL);
+
+	// Solver and narrowphase state.
+	d->m_solverDataGPU = adl::Solver<adl::TYPE_CL>::allocate( d->m_deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	d->m_bodyBufferGPU = new adl::Buffer<RigidBodyBase::Body>(d->m_deviceCL, MAX_CONVEX_BODIES_CL);
+	d->m_Data = adl::ChNarrowphase<adl::TYPE_CL>::allocate(d->m_deviceCL);
+
+	// Shape storage: device buffer plus the host table of shape pointers.
+	d->m_ShapeBuffer = adl::ChNarrowphase<adl::TYPE_CL>::allocateShapeBuffer(d->m_deviceCL, MAX_CONVEX_SHAPES_CL);
+	d->m_shapePointers = new adl::HostBuffer<ConvexHeightField*>(d->m_deviceHost,MAX_CONVEX_SHAPES_CL);
+
+	d->m_numAcceleratedShapes = 0;
+	d->m_numAcceleratedRigidBodies = 0;
+
+	// Contact and friction constraint storage for the solver.
+	d->m_contactCGPU = adl::Solver<adl::TYPE_CL>::allocateConstraint4( d->m_deviceCL, MAX_BROADPHASE_COLLISION_CL);
+	d->m_frictionCGPU = adl::Solver<adl::TYPE_CL>::allocateFrictionConstraint( d->m_deviceCL, MAX_BROADPHASE_COLLISION_CL);
+}
+
+// Registers a convex shape: stores the pointer in the host-side shape table and
+// passes it to ChNarrowphase::setShape for the device shape buffer.
+// Returns the index assigned to the shape (the previous shape count).
+// The pointer is stored, not copied, so convexShape must stay alive while it is
+// registered.
+int btGpuNarrowphaseAndSolver::registerShape(ConvexHeightField* convexShape)
+{
+	// Both shape buffers were allocated with MAX_CONVEX_SHAPES_CL slots; catch
+	// overflow in debug builds, mirroring the capacity assert in registerRigidBody.
+	assert(m_internalData->m_numAcceleratedShapes < MAX_CONVEX_SHAPES_CL);
+
+	(*m_internalData->m_shapePointers)[m_internalData->m_numAcceleratedShapes] = convexShape;
+	// NOTE(review): 0.01f equals cfgNP.m_collisionMargin in computeContactsAndSolver;
+	// presumably the collision margin — confirm against ChNarrowphase::setShape.
+	adl::ChNarrowphase<adl::TYPE_CL>::setShape(m_internalData->m_ShapeBuffer, convexShape, m_internalData->m_numAcceleratedShapes, 0.01f);
+	return m_internalData->m_numAcceleratedShapes++;
+}
+
+// Returns the device rigid body buffer's m_ptr as a cl_mem handle.
+cl_mem	btGpuNarrowphaseAndSolver::getBodiesGpu()
+{
+	adl::Buffer<RigidBodyBase::Body>* bodies = m_internalData->m_bodyBufferGPU;
+	return (cl_mem)bodies->m_ptr;
+}
+
+// Returns the device inertia buffer's m_ptr as a cl_mem handle.
+cl_mem	btGpuNarrowphaseAndSolver::getBodyInertiasGpu()
+{
+	adl::Buffer<RigidBodyBase::Inertia>* inertias = m_internalData->m_inertiaBufferGPU;
+	return (cl_mem)inertias->m_ptr;
+}
+
+
+// Appends a rigid body to the host-side body and inertia buffers and returns its index.
+//
+// shapeIndex   index returned by registerShape(); a negative value marks the body
+//              as a plane and records it in m_planeBodyIndex.
+// mass         0 creates a static body (zero inverse mass and inverse inertia).
+// position     three floats (x, y, z).
+// orientation  four floats, copied in order into the body's quaternion.
+// writeToGpu   when true, the new body and inertia entries are also written to
+//              the device buffers immediately.
+int btGpuNarrowphaseAndSolver::registerRigidBody(int shapeIndex, float mass, const float* position, const float* orientation , bool writeToGpu)
+{
+	assert(m_internalData->m_numAcceleratedRigidBodies< (MAX_CONVEX_BODIES_CL-1));
+
+	RigidBodyBase::Body& body = m_internalData->m_bodyBufferCPU->m_ptr[m_internalData->m_numAcceleratedRigidBodies];
+
+	// Fixed material parameters for every body.
+	body.m_frictionCoeff = 1.f;
+	body.m_restituitionCoeff = 0.f;
+
+	// Bodies start at rest.
+	body.m_angVel = make_float4(0.f);
+	body.m_linVel = make_float4(0.f);
+	body.m_pos = make_float4(position[0],position[1],position[2],0.f);
+	body.m_quat = make_float4(orientation[0],orientation[1],orientation[2],orientation[3]);
+	body.m_shapeIdx = shapeIndex;
+
+	if (shapeIndex>=0)
+	{
+		body.m_shapeType = CollisionShape::SHAPE_CONVEX_HEIGHT_FIELD;
+	} else
+	{
+		body.m_shapeType = CollisionShape::SHAPE_PLANE;
+		m_planeBodyIndex = m_internalData->m_numAcceleratedRigidBodies;
+	}
+
+	body.m_invMass = (mass != 0.f) ? 1.f/mass : 0.f;
+
+	if (writeToGpu)
+		m_internalData->m_bodyBufferGPU->write(&body,1,m_internalData->m_numAcceleratedRigidBodies);
+
+	RigidBodyBase::Inertia& inertia = m_internalData->m_inertiaBufferCPU->m_ptr[m_internalData->m_numAcceleratedRigidBodies];
+
+	if (mass==0.f)
+	{
+		// Static body: no rotational response.
+		inertia.m_initInvInertia = mtZero();
+		inertia.m_invInertia = mtZero();
+	} else
+	{
+		assert(body.m_shapeIdx>=0);
+
+		//approximate using the aabb of the shape
+		Aabb aabb = (*m_internalData->m_shapePointers)[shapeIndex]->m_aabb;
+
+		// NOTE(review): this is max-min (the full AABB extent) and it is doubled
+		// again below, as in the original code; confirm whether that is intended.
+		float4 extents = (aabb.m_max - aabb.m_min);
+
+		float lx=2.f*extents.x;
+		float ly=2.f*extents.y;
+		float lz=2.f*extents.z;
+
+		// Box inertia formula applied to (lx, ly, lz).
+		float4 localInertia = make_float4( (mass/12.0f) * (ly*ly + lz*lz),
+			(mass/12.0f) * (lx*lx + lz*lz),
+			(mass/12.0f) * (lx*lx + ly*ly));
+
+		// Local-space inverse inertia is diagonal.
+		inertia.m_initInvInertia = mtZero();
+		inertia.m_initInvInertia.m_row[0].x = 1.f/localInertia.x;
+		inertia.m_initInvInertia.m_row[1].y = 1.f/localInertia.y;
+		inertia.m_initInvInertia.m_row[2].z = 1.f/localInertia.z;
+
+		// World-space inverse inertia: R * I^-1 * R^T for the body's orientation.
+		Matrix3x3 rot = qtGetRotationMatrix( body.m_quat);
+		Matrix3x3 rotT = mtTranspose( rot );
+		inertia.m_invInertia = mtMul( mtMul( rot, inertia.m_initInvInertia ), rotT );
+	}
+
+	if (writeToGpu)
+		m_internalData->m_inertiaBufferGPU->write(&inertia,1,m_internalData->m_numAcceleratedRigidBodies);
+
+	return m_internalData->m_numAcceleratedRigidBodies++;
+}
+
+// Writes every registered body and its inertia from the host-side buffers to
+// the device buffers; the element count is the number of registered bodies.
+void	btGpuNarrowphaseAndSolver::writeAllBodiesToGpu()
+{
+	CustomDispatchData* d = m_internalData;
+
+	d->m_bodyBufferGPU->write(d->m_bodyBufferCPU->m_ptr,d->m_numAcceleratedRigidBodies);
+	d->m_inertiaBufferGPU->write(d->m_inertiaBufferCPU->m_ptr,d->m_numAcceleratedRigidBodies);
+}
+
+
+
+// Releases everything the constructor allocated. The CL device itself is not
+// released here; only the host device created by the constructor is.
+// The release order is the same as in the original implementation.
+btGpuNarrowphaseAndSolver::~btGpuNarrowphaseAndSolver(void)
+{
+	CustomDispatchData* d = m_internalData;
+	if (!d)
+		return;	// constructed without a CL device: nothing was allocated
+
+	delete d->m_pBufPairsCPU;
+	delete d->m_convexPairsOutGPU;
+	delete d->m_planePairs;
+	delete d->m_pBufContactOutGPU;
+	delete d->m_inertiaBufferGPU;
+	delete d->m_pBufContactOutCPU;
+	delete d->m_shapePointers;
+	adl::ChNarrowphase<adl::TYPE_CL>::deallocateShapeBuffer(d->m_ShapeBuffer);
+	delete d->m_inertiaBufferCPU;
+	adl::Solver<adl::TYPE_CL>::deallocateConstraint4( d->m_contactCGPU );
+	adl::Solver<adl::TYPE_CL>::deallocateFrictionConstraint( d->m_frictionCGPU );
+
+	delete d->m_bodyBufferGPU;
+	adl::Solver<adl::TYPE_CL>::deallocate(	d->m_solverDataGPU);
+	delete d->m_bodyBufferCPU;
+	adl::ChNarrowphase<adl::TYPE_CL>::deallocate(d->m_Data);
+
+	// The host device goes last, after every buffer created on it.
+	adl::DeviceUtils::deallocate(d->m_deviceHost);
+
+	delete d;
+}
+
+
+
+
+
+void btGpuNarrowphaseAndSolver::computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs) 
+{
+
+	BT_PROFILE("computeContactsAndSolver");
+	bool bGPU = (m_internalData != 0);
+	int maxBodyIndex = m_internalData->m_numAcceleratedRigidBodies;
+
+	if (!maxBodyIndex)
+		return;
+	int numOfConvexRBodies = maxBodyIndex;
+
+	adl::ChNarrowphaseBase::Config cfgNP;
+	cfgNP.m_collisionMargin = 0.01f;
+	int nContactOut = 0;
+	//printf("convexPairsOut.m_size = %d\n",m_internalData->m_convexPairsOutGPU->m_size);
+
+
+	adl::Buffer<int2> broadphasePairsGPU;
+	broadphasePairsGPU.m_ptr = (int2*)broadphasePairs;
+	broadphasePairsGPU.m_size = numBroadphasePairs;
+	broadphasePairsGPU.m_device = m_internalData->m_deviceCL;
+
+
+	bool useCulling = true;
+	if (useCulling)
+	{
+		BT_PROFILE("ChNarrowphase::culling");
+		adl::DeviceUtils::waitForCompletion(m_internalData->m_deviceCL);
+
+		numPairsOut = adl::ChNarrowphase<adl::TYPE_CL>::culling(
+			m_internalData->m_Data, 
+			&broadphasePairsGPU, 
+			numBroadphasePairs,
+			m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer,
+			m_internalData->m_convexPairsOutGPU,
+			cfgNP);
+	}
+
+	{
+		BT_PROFILE("ChNarrowphase::execute");
+		if (useCulling)
+		{
+		
+			if (m_planeBodyIndex>=0)
+			{
+				BT_PROFILE("ChNarrowphase:: plane versus convex");
+				//todo: get rid of this dynamic allocation
+				int2* hostPairs = new int2[m_internalData->m_numAcceleratedRigidBodies-1];
+				int index=0;
+				for (int i=0;i<m_internalData->m_numAcceleratedRigidBodies;i++)
+				{
+					if (i!=m_planeBodyIndex)
+					{
+						hostPairs[index].x = m_planeBodyIndex;
+						hostPairs[index].y = i;
+						index++;
+					}
+				}
+				assert(m_internalData->m_numAcceleratedRigidBodies-1 == index);
+				m_internalData->m_planePairs->write(hostPairs,index);
+				adl::DeviceUtils::waitForCompletion(m_internalData->m_deviceCL);
+				delete[]hostPairs;
+				//convex versus plane
+				adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, m_internalData->m_planePairs, index, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, 
+					0,0,m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
+			}
+		
+			//convex versus convex
+			adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, m_internalData->m_convexPairsOutGPU,numPairsOut, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
+		} else
+		{
+			adl::ChNarrowphase<adl::TYPE_CL>::execute(m_internalData->m_Data, &broadphasePairsGPU, numBroadphasePairs, m_internalData->m_bodyBufferGPU, m_internalData->m_ShapeBuffer, m_internalData->m_pBufContactOutGPU, nContactOut, cfgNP);
+		}
+
+		adl::DeviceUtils::waitForCompletion(m_internalData->m_deviceCL);
+	}
+	
+	if (!nContactOut)
+		return;
+	
+	
+	bool useSolver = true;//true;//false;
+
+	if (useSolver)
+	{
+		float dt=1./60.;
+		adl::SolverBase::ConstraintCfg csCfg( dt );
+		csCfg.m_enableParallelSolve = true;
+		csCfg.m_averageExtent = 0.2f;//@TODO m_averageObjExtent;
+		csCfg.m_staticIdx = m_planeBodyIndex;
+
+		
+		bool exposeInternalBatchImplementation=true;
+
+		adl::Solver<adl::TYPE_HOST>::Data* cpuSolverData = 0;
+		if (exposeInternalBatchImplementation)
+		{
+			BT_PROFILE("Batching");
+
+			cpuSolverData = adl::Solver<adl::TYPE_HOST>::allocate( m_internalData->m_deviceHost, nContactOut);
+
+			adl::Buffer<Contact4>* contactsIn = m_internalData->m_pBufContactOutGPU;
+			const adl::Buffer<RigidBodyBase::Body>* bodyBuf = m_internalData->m_bodyBufferGPU;
+			void* additionalData = m_internalData->m_frictionCGPU;
+			const adl::Buffer<RigidBodyBase::Inertia>* shapeBuf = m_internalData->m_inertiaBufferGPU;
+			SolverData contactCOut = m_internalData->m_contactCGPU;
+			int nContacts = nContactOut;
+
+			bool useCPU=false;
+
+			if (useCPU)
+			{
+				BT_PROFILE("CPU batch");
+				{
+					BT_PROFILE("CPU sortContacts2");
+					sortContacts2( cpuSolverData, bodyBuf, contactsIn, additionalData, nContacts, csCfg );
+				}
+
+				CPUSolveData* dataCPU = (CPUSolveData*)cpuSolverData->m_parallelSolveData;
+				{
+					BT_PROFILE("CPU batchContacts2");
+
+					adl::Buffer<u32> n; n.setRawPtr( cpuSolverData->m_device, dataCPU->m_n, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+					adl::Buffer<u32> offsets; offsets.setRawPtr( cpuSolverData->m_device, dataCPU->m_offset, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+					batchContacts2( cpuSolverData, contactsIn, nContacts, &n, &offsets, csCfg.m_staticIdx );
+				}
+
+				{
+					BT_PROFILE("CPU convertToConstraints2");
+					convertToConstraints2( cpuSolverData, bodyBuf, shapeBuf, contactsIn, contactCOut, additionalData, nContacts, csCfg );
+				}
+
+				{
+					BT_PROFILE("CPU -> GPU copy");
+					ParallelSolveData* dataGPU = (ParallelSolveData*)m_internalData->m_solverDataGPU->m_parallelSolveData;
+					dataGPU->m_numConstraints->write(dataCPU->m_n,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+					dataGPU->m_offsets->write(dataCPU->m_offset,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+					adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
+				}
+
+			}
+			else
+			{
+				BT_PROFILE("GPU batch");
+
+				adl::Solver<adl::TYPE_CL>::Data* data = m_internalData->m_solverDataGPU;
+
+				{
+					if( data->m_contactBuffer )
+					{
+						if( data->m_contactBuffer->getSize() < nContacts )
+						{
+							BT_PROFILE("delete data->m_contactBuffer;");
+							delete data->m_contactBuffer;
+							data->m_contactBuffer = 0;
+						}
+					}
+					if( data->m_contactBuffer == 0 )
+					{
+						data->m_contactBuffer = new adl::Buffer<Contact4>( data->m_device, nContacts );
+					}
+
+					adl::Buffer<Contact4>* contactNative  = contactsIn;
+
+					ParallelSolveData* nativeSolveData = (ParallelSolveData*)data->m_parallelSolveData;
+
+					{
+
+						ADLASSERT( data->m_device->m_type == adl::TYPE_CL );
+						adl::Buffer<RigidBodyBase::Body>* bodyNative = adl::BufferUtils::map<adl::TYPE_CL, true>( data->m_device, bodyBuf );
+						adl::Buffer<Contact4>* contactNative = adl::BufferUtils::map<adl::TYPE_CL, true>( data->m_device, contactsIn );
+
+						const int sortAlignment = 512; // todo. get this out of sort
+						if( csCfg.m_enableParallelSolve )
+						{
+							ParallelSolveData* nativeSolveData = (ParallelSolveData*)data->m_parallelSolveData;
+
+							int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
+
+							adl::Buffer<u32>* countsNative = nativeSolveData->m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+							adl::Buffer<u32>* offsetsNative = nativeSolveData->m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+
+							{	//	2. set cell idx
+								BT_PROFILE("GPU set cell idx");
+								struct CB
+								{
+									int m_nContacts;
+									int m_staticIdx;
+									float m_scale;
+									int m_nSplit;
+								};
+
+								ADLASSERT( sortSize%64 == 0 );
+								CB cdata;
+								cdata.m_nContacts = nContacts;
+								cdata.m_staticIdx = csCfg.m_staticIdx;
+								cdata.m_scale = 1.f/(adl::SolverBase::N_OBJ_PER_SPLIT*csCfg.m_averageExtent);
+								cdata.m_nSplit = adl::SolverBase::N_SPLIT;
+
+								adl::Buffer<CB> constBuffer( data->m_device, 1, adl::BufferBase::BUFFER_CONST );
+								adl::Launcher::BufferInfo bInfo[] = { adl::Launcher::BufferInfo( contactNative ), adl::Launcher::BufferInfo( bodyNative ), adl::Launcher::BufferInfo( data->m_sortDataBuffer ) };
+								adl::Launcher launcher( data->m_device, data->m_setSortDataKernel );
+								launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(adl::Launcher::BufferInfo) );
+								launcher.setConst( constBuffer, cdata );
+								launcher.launch1D( sortSize, 64 );
+							}
+							bool gpuRadixSort=true;
+							if (gpuRadixSort)
+							{	//	3. sort by cell idx
+								BT_PROFILE("gpuRadixSort");
+								int n = adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT;
+								int sortBit = 32;
+								//if( n <= 0xffff ) sortBit = 16;
+								//if( n <= 0xff ) sortBit = 8;
+								//adl::RadixSort<adl::TYPE_CL>::execute( data->m_sort, *data->m_sortDataBuffer, sortSize );
+								adl::RadixSort32<adl::TYPE_CL>::execute( data->m_sort32, *data->m_sortDataBuffer, sortSize );
+
+							} else
+							{
+								BT_PROFILE("cpu RadixSort");
+								adl::HostBuffer<adl::SortData> sortData(m_internalData->m_deviceHost,nContacts);
+								data->m_sortDataBuffer->read(sortData.m_ptr,nContacts);
+								adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
+
+								adl::RadixSort<adl::TYPE_HOST>::Data* sData = adl::RadixSort<adl::TYPE_HOST>::allocate( m_internalData->m_deviceHost, nContacts );
+								adl::RadixSort<adl::TYPE_HOST>::execute( sData, sortData, nContacts );
+								adl::RadixSort<adl::TYPE_HOST>::deallocate( sData );
+
+								data->m_sortDataBuffer->write(sortData.m_ptr,nContacts);
+								adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
+							}
+
+
+
+							bool gpuBoundSearch=true;
+							if (gpuBoundSearch)
+							{	//	4. find entries
+								BT_PROFILE("gpuBoundSearch");
+								adl::BoundSearch<adl::TYPE_CL>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, 
+									adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT, adl::BoundSearchBase::COUNT );
+
+								adl::PrefixScan<adl::TYPE_CL>::execute( data->m_scan, *countsNative, *offsetsNative, 
+									adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+							} else
+							{
+								BT_PROFILE("cpuBoundSearch");
+								adl::HostBuffer<adl::SortData> sortData(m_internalData->m_deviceHost,nContacts);
+								data->m_sortDataBuffer->read(sortData.m_ptr,nContacts);
+								adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
+
+								adl::HostBuffer<u32> n0( m_internalData->m_deviceHost, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+								adl::HostBuffer<u32> offset0( m_internalData->m_deviceHost, adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT );
+								for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+								{
+									n0[i] = 0;
+									offset0[i] = 0;
+								}
+
+								for(int i=0; i<nContacts; i++)
+								{
+									int idx = sortData[i].m_key;
+									assert(idx>=0);
+									assert(idx<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+									n0[idx]++;
+								}
+
+								//	scan
+								int sum = 0;
+								for(int i=0; i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT; i++)
+								{
+									offset0[i] = sum;
+									sum += n0[i];
+								}
+
+								countsNative->write(n0.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+								offsetsNative->write(offset0.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+								adl::DeviceUtils::waitForCompletion( data->m_device );
+
+							}
+							{	//	5. sort constraints by cellIdx
+								{
+									BT_PROFILE("gpu m_reorderContactKernel");
+									adl::Buffer<int4> constBuffer( data->m_device, 1, adl::BufferBase::BUFFER_CONST );
+
+									int4 cdata; cdata.x = nContacts;
+									adl::Launcher::BufferInfo bInfo[] = { adl::Launcher::BufferInfo( contactNative ), adl::Launcher::BufferInfo( data->m_contactBuffer ), adl::Launcher::BufferInfo( data->m_sortDataBuffer ) };
+									adl::Launcher launcher( data->m_device, data->m_reorderContactKernel );
+									launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(adl::Launcher::BufferInfo) );
+									launcher.setConst( constBuffer, cdata );
+									launcher.launch1D( nContacts, 64 );
+								}
+							}
+
+						}
+
+						adl::BufferUtils::unmap<false>( bodyNative, bodyBuf );
+						adl::BufferUtils::unmap<false>( contactNative, contactsIn );
+
+					}
+
+					adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL);
+
+					{
+						BT_PROFILE("gpu m_copyConstraintKernel");
+						adl::Buffer<int4> constBuffer( data->m_device, 1, adl::BufferBase::BUFFER_CONST );
+						int4 cdata; cdata.x = nContacts;
+						adl::Launcher::BufferInfo bInfo[] = { adl::Launcher::BufferInfo( data->m_contactBuffer ), adl::Launcher::BufferInfo( contactNative ) };
+						adl::Launcher launcher( data->m_device, data->m_copyConstraintKernel );
+						launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(adl::Launcher::BufferInfo) );
+						launcher.setConst( constBuffer, cdata );
+						launcher.launch1D( nContacts, 64 );
+						adl::DeviceUtils::waitForCompletion( data->m_device );
+					}
+					
+					bool compareGPU = false;
+					if (gpuBatchContacts)
+					{
+						BT_PROFILE("gpu batchContacts");
+						adl::Solver<adl::TYPE_CL>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, csCfg.m_staticIdx );
+					}
+					else
+					{
+						BT_PROFILE("cpu batchContacts2");
+						cpuSolverData->m_parallelSolveData = 0;//
+						ParallelSolveData* dataGPU = (ParallelSolveData*)m_internalData->m_solverDataGPU->m_parallelSolveData;
+						adl::Buffer<u32> numConstraints(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+						adl::Buffer<u32> offsets(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+
+						{
+							BT_PROFILE("gpu->cpu read m_numConstraints");
+							dataGPU->m_numConstraints->read(numConstraints.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							dataGPU->m_offsets->read(offsets.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+						}
+
+						adl::Buffer<u32> gpunumConstraints(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+						adl::Buffer<u32> gpuoffsets(cpuSolverData->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+
+						if (compareGPU)
+						{
+							adl::Buffer<Contact4> contactNativeCopy (data->m_device,contactNative->getSize());
+							contactNativeCopy.write(*contactNative,contactNative->getSize());
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+
+							adl::Buffer<u32> tmpNumGPU(data->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							adl::Buffer<u32> tmpOffsetGPU(data->m_device,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							tmpNumGPU.write(numConstraints.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							tmpOffsetGPU.write(offsets.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+
+							BT_PROFILE("gpu batchContacts");
+							//adl::Solver<adl::TYPE_CL>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, csCfg.m_staticIdx );
+							adl::Solver<adl::TYPE_CL>::batchContacts( data, &contactNativeCopy, nContacts, &tmpNumGPU, &tmpOffsetGPU, csCfg.m_staticIdx );
+
+
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+
+							//compare now
+							tmpNumGPU.read(gpunumConstraints,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							tmpOffsetGPU.read(gpuoffsets,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+
+						}
+
+						CPUSolveData* dataCPU = (CPUSolveData*)cpuSolverData->m_parallelSolveData;
+
+						{
+							BT_PROFILE("cpu batchContacts2");
+							batchContacts2( cpuSolverData, contactNative, nContacts, &numConstraints, &offsets, csCfg.m_staticIdx );
+						}
+
+
+						if (compareGPU)
+						{
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+							dataGPU->m_numConstraints->write(numConstraints.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							dataGPU->m_offsets->write(offsets.m_ptr,adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT);
+							adl::DeviceUtils::waitForCompletion( data->m_device );
+
+
+							for (int i=0;i<adl::SolverBase::N_SPLIT*adl::SolverBase::N_SPLIT;i++)
+							{
+								if (gpunumConstraints.m_ptr[i] != numConstraints.m_ptr[i])
+								{
+									printf("numConstraints error at %d, expected %d got %d\n",i,numConstraints.m_ptr[i],gpunumConstraints.m_ptr[i]);
+								}
+
+								if (gpuoffsets.m_ptr[i] != offsets.m_ptr[i])
+								{
+									printf("numConstraints error at %d, expected %d got %d\n",i,offsets.m_ptr[i],gpuoffsets.m_ptr[i]);
+								}
+
+							}
+
+						}
+
+					}
+					if (1)
+					{
+						BT_PROFILE("gpu convertToConstraints");
+						adl::Solver<adl::TYPE_CL>::convertToConstraints( data, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, csCfg );
+						adl::DeviceUtils::waitForCompletion( data->m_device );
+					}
+					if (compareGPU)
+					{
+						adl::Buffer<Contact4> contactNativeCPU(cpuSolverData->m_device,contactNative->getSize());
+						contactNative->read(contactNativeCPU,nContacts);
+						adl::DeviceUtils::waitForCompletion( data->m_device );
+						for (int i=0;i<nContacts;i++)
+						{
+							//if (contactNativeCopyCPU.m_ptr[i].m_frictionCoeffCmp !=45874)// contactNativeCPU.m_ptr[i].m_batchIdx != contactNativeCopyCPU.m_ptr[i].m_batchIdx)
+							{
+								//if (.m_friction!=45874
+								//printf("not matching at %d, expected %d, got %d\n",i,contactNativeCPU.m_ptr[i].m_batchIdx,contactNativeCopyCPU.m_ptr[i].m_batchIdx);
+							}
+						}
+					}
+
+				}
+			}
+
+		} else
+		{
+			BT_PROFILE("GPU reorderConvertToConstraints");
+			adl::Solver<adl::TYPE_CL>::reorderConvertToConstraints( 
+				m_internalData->m_solverDataGPU, 
+				m_internalData->m_bodyBufferGPU, 
+				m_internalData->m_inertiaBufferGPU, 
+				m_internalData->m_pBufContactOutGPU,
+				m_internalData->m_contactCGPU, 
+				m_internalData->m_frictionCGPU, 
+				nContactOut, 
+				csCfg );
+			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
+		}
+
+
+		if (1)
+		{
+			BT_PROFILE("GPU solveContactConstraint");
+			m_internalData->m_solverDataGPU->m_nIterations = 5;
+
+			adl::Solver<adl::TYPE_CL>::solveContactConstraint( m_internalData->m_solverDataGPU, 
+				m_internalData->m_bodyBufferGPU, 
+				m_internalData->m_inertiaBufferGPU, 
+				m_internalData->m_contactCGPU,
+				0, 
+				nContactOut );
+
+			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
+		}
+
+		if (cpuSolverData)
+			adl::Solver<adl::TYPE_HOST>::deallocate( cpuSolverData );
+
+		if (0)
+		{
+			BT_PROFILE("read body velocities back to CPU");
+			//read body updated linear/angular velocities back to CPU
+			m_internalData->m_bodyBufferGPU->read(
+				m_internalData->m_bodyBufferCPU->m_ptr,numOfConvexRBodies);
+			adl::DeviceUtils::waitForCompletion( m_internalData->m_deviceCL );
+		}
+	}
+
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef GPU_NARROWPHASE_SOLVER_H
+#define GPU_NARROWPHASE_SOLVER_H
+
+
+
+//Capacity limits for code that includes this header (for example CLPhysicsDemo.cpp sizes its
+//buffers with them). Replacement lists that contain an operator are parenthesized so the macros
+//expand safely inside larger expressions such as 'x % MAX_CONVEX_BODIES_CL'.
+//#define MAX_CONVEX_BODIES_CL (8*1024)
+#define MAX_CONVEX_BODIES_CL (128*1024)
+#define MAX_PAIRS_PER_BODY_CL 16
+#define MAX_CONVEX_SHAPES_CL 8192
+#define MAX_BROADPHASE_COLLISION_CL (MAX_CONVEX_BODIES_CL*MAX_PAIRS_PER_BODY_CL)
+
+/*
+#define MAX_CONVEX_BODIES_CL 1024
+#define MAX_PAIRS_PER_BODY_CL 32
+#define MAX_CONVEX_SHAPES_CL 8192
+#define MAX_BROADPHASE_COLLISION_CL (MAX_CONVEX_BODIES_CL*MAX_PAIRS_PER_BODY_CL)
+*/
+
+//Forward declaration only: this header handles adl::DeviceCL by pointer and does not need its definition.
+namespace adl
+{
+	struct DeviceCL;
+}
+
+
+struct	CustomDispatchData;
+
+#include "../basic_initialize/btOpenCLInclude.h"
+
+
+//Interface of the GPU narrowphase and contact solver. Only declarations live in this header;
+//the implementation state is hidden behind the forward-declared CustomDispatchData.
+class btGpuNarrowphaseAndSolver
+{
+protected:
+
+	//opaque implementation data (CustomDispatchData is only forward declared here)
+	CustomDispatchData*	m_internalData;
+	int m_acceleratedCompanionShapeIndex;
+	int m_planeBodyIndex;
+
+public:
+	//deviceCL: the adl OpenCL device to work with; ownership is not visible from this header
+	btGpuNarrowphaseAndSolver(adl::DeviceCL* deviceCL);
+
+	virtual ~btGpuNarrowphaseAndSolver(void);
+
+	//Registers a convex shape and returns an int; presumably the shape index later passed to
+	//registerRigidBody - TODO confirm against the implementation.
+	int registerShape(class ConvexHeightField* convexShape);
+	//Registers a rigid body that uses shape 'shapeIndex'. position/orientation are float arrays
+	//whose lengths are not visible here (presumably 3 and 4 floats - TODO confirm).
+	//writeToGpu defaults to true; callers that pass false can upload later with writeAllBodiesToGpu().
+	int registerRigidBody(int shapeIndex, float mass, const float* position, const float* orientation, bool writeToGpu = true);
+	void	writeAllBodiesToGpu();
+	
+	//btBroadphasePair* GetPair(btBroadphasePairArray& pairArray, int idxBodyA, int idxBodyB);
+
+	//broadphasePairs: OpenCL buffer holding the overlapping pairs; numBroadphasePairs: how many it contains
+	virtual void computeContactsAndSolver(cl_mem broadphasePairs, int numBroadphasePairs);
+
+	//OpenCL buffer handle for the rigid bodies
+	cl_mem	getBodiesGpu();
+
+	//OpenCL buffer handle for the body inertias
+	cl_mem	getBodyInertiasGpu();
+
+};
+
+#endif //GPU_NARROWPHASE_SOLVER_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/main.cpp
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline/premake4.lua
@@ -0,0 +1,5 @@
+
+	-- Per-vendor build scripts: the AMD and NVIDIA subdirectories are included, Intel is commented out.
+	include "AMD"
+--	include "Intel"
+	include "NVIDIA"
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/AMD/premake4.lua
@@ -0,0 +1,64 @@
+	
+	-- The project below is only generated when findOpenCL_AMD() reports success.
+	hasCL = findOpenCL_AMD()
+	
+	if (hasCL) then
+
+		project "OpenCL_gpu_rigidbody_pipeline2_AMD"
+
+		initOpenCL_AMD()
+	
+		language "C++"
+				
+		-- console executable written to the shared bin directory
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+
+		initOpenGL()
+		initGlew()
+
+		includedirs {
+		"../../primitives",
+		"../../../../../src"
+		}
+		
+		-- Sources are listed explicitly: the demo's own files, the narrowphase/solver and broadphase
+		-- code from sibling directories, and individual Bullet source files from src/.
+		files {
+			"../main.cpp",
+			"../CLPhysicsDemo.cpp",
+			"../CLPhysicsDemo.h",
+			"../GLInstancingRenderer.cpp",
+			"../GLInstancingRenderer.h",
+			"../GlutRenderer.cpp",
+			"../GlutRenderer.h",
+			"../Win32OpenGLRenderManager.cpp",
+			"../Win32OpenGLRenderManager.h",	
+			"../../gpu_rigidbody_pipeline/btConvexUtility.cpp",
+			"../../gpu_rigidbody_pipeline/btConvexUtility.h",
+			"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.cpp",
+			"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
+			"../../../../../src/LinearMath/btConvexHullComputer.cpp",
+			"../../../../../src/LinearMath/btConvexHullComputer.h",
+			"../../broadphase_benchmark/findPairsOpenCL.cpp",
+			"../../broadphase_benchmark/findPairsOpenCL.h",
+			-- NOTE(review): CLPhysicsDemo.cpp includes "btGridBroadphaseCl.h" (lower-case 'l'); confirm the on-disk spelling
+			"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
+			"../../broadphase_benchmark/btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../../../src/LinearMath/btAlignedAllocator.cpp",
+			"../../../../../src/LinearMath/btQuickprof.cpp",
+			"../../../../../src/LinearMath/btQuickprof.h",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../../../src/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/CLPhysicsDemo.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/CLPhysicsDemo.cpp
@@ -0,0 +1,529 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "OpenGLInclude.h"
+
+#include "CLPhysicsDemo.h"
+#include "LinearMath/btAlignedObjectArray.h"
+#include "DemoSettings.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+#include "../opengl_interop/btOpenCLGLInteropBuffer.h"
+#include "../broadphase_benchmark/findPairsOpenCL.h"
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btQuaternion.h"
+#include "LinearMath/btMatrix3x3.h"
+#include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h"
+#include "../../opencl/gpu_rigidbody_pipeline/btConvexUtility.h"
+#include "../../dynamics/basic_demo/ConvexHeightFieldShape.h"
+#include "../broadphase_benchmark/btGridBroadphaseCl.h"
+#include "LinearMath/btQuickprof.h"
+
+
+#define MSTRINGIFY(A) #A
+static char* interopKernelString = 
+#include "../broadphase_benchmark/integrateKernel.cl"
+
+#define INTEROPKERNEL_SRC_PATH "../../opencl/broadphase_benchmark/integrateKernel.cl"
+	
+//kernel object for "integrateTransformsKernel"; created in CLPhysicsDemo::init()
+cl_kernel g_integrateTransformsKernel;
+
+
+
+//when false, stepSimulation() skips the physics kernels and only does the buffer hand-over
+bool runOpenCLKernels = true;
+
+
+//created in CLPhysicsDemo::init(), deleted in CLPhysicsDemo::cleanup()
+btGpuNarrowphaseAndSolver* narrowphaseAndSolver = 0;
+//most recently created shape (see registerCollisionShape)
+ConvexHeightField* s_convexHeightField = 0 ;
+//created in CLPhysicsDemo::setupInterop()
+btOpenCLGLInteropBuffer* g_interopBuffer = 0;
+
+//OpenGL vertex buffer object and its size, defined in another translation unit
+extern GLuint               cube_vbo;
+extern int VBOsize;
+
+//OpenCL view of the vertex buffer: the interop buffer, or a plain buffer created in stepSimulation()
+cl_mem clBuffer=0;
+//address returned by glMapBuffer on the non-interop path
+char* hostPtr=0;
+//blocking flag passed to clEnqueueWriteBuffer/clEnqueueReadBuffer
+cl_bool blocking=  CL_TRUE;
+
+
+
+//argument block handed to the broadphase helper functions
+btFindPairsIO gFpIO;
+
+//OpenCL context, command queue and device selected by InitCL()
+cl_context			g_cxMainContext;
+cl_command_queue	g_cqCommandQue;
+cl_device_id		g_device;
+
+//raw cl_mem aliases of InternalData's velocity and body-time buffers, set in CLPhysicsDemo::init()
+cl_mem				gLinVelMem=0;
+cl_mem				gAngVelMem=0;
+cl_mem				gBodyTimes=0;
+
+#include <Adl/Adl.h>
+
+//adl device wrapper around the context/queue above; created in CLPhysicsDemo::init()
+adl::DeviceCL* g_deviceCL=0;
+
+//Host-side record for one corner of an axis aligned bounding box: three floats plus one
+//unsigned int. registerCollisionShape stores a min and a max corner per shape and puts the
+//shape index in uw.
+struct  btAABBHost //keep this in sync with btAABBCL!
+{
+	float			fx;
+	float			fy;
+	float			fz;
+	unsigned int	uw;
+};
+
+//Private state of CLPhysicsDemo.
+//The device buffers and the broadphase start out null and are created in CLPhysicsDemo::init().
+//The host staging arrays hold MAX_CONVEX_BODIES_CL entries each and are allocated and freed by
+//this struct. NOTE: the struct owns raw arrays, so it must not be copied.
+struct InternalData
+{
+	adl::Buffer<btVector3>* m_linVelBuf;
+	adl::Buffer<btVector3>* m_angVelBuf;
+	adl::Buffer<float>* m_bodyTimes;
+	bool	m_useInterop;
+	btGridBroadphaseCl* m_Broadphase;
+
+	adl::Buffer<btAABBHost>* m_localShapeAABB;
+
+	btVector3*	m_linVelHost;
+	btVector3*	m_angVelHost;
+	float*		m_bodyTimesHost;
+
+	//Every pointer gets a defined value (in declaration order) so that deleting a buffer
+	//that was never created is a harmless delete of a null pointer.
+	InternalData()
+		:m_linVelBuf(0),
+		m_angVelBuf(0),
+		m_bodyTimes(0),
+		m_useInterop(false),
+		m_Broadphase(0),
+		m_localShapeAABB(0),
+		m_linVelHost(0),
+		m_angVelHost(0),
+		m_bodyTimesHost(0)
+	{
+		m_linVelHost= new btVector3[MAX_CONVEX_BODIES_CL];
+		m_angVelHost = new btVector3[MAX_CONVEX_BODIES_CL];
+		m_bodyTimesHost = new float[MAX_CONVEX_BODIES_CL];
+	}
+	~InternalData()
+	{
+		delete[] m_linVelHost;
+		delete[] m_angVelHost;
+		delete[] m_bodyTimesHost;
+
+	}
+};
+
+
+//Creates the global OpenCL context (g_cxMainContext) and, when that context exposes at least
+//one device, selects device 0 (g_device) and creates its command queue (g_cqCommandQue).
+//
+//preferredDeviceIndex / preferredPlatformIndex : forwarded to btOpenCLUtils::createContextFromType
+//                                                only when useInterop is false
+//useInterop                                    : when true the current OpenGL context handles are
+//                                                passed so the context can share GL objects
+void InitCL(int preferredDeviceIndex, int preferredPlatformIndex, bool useInterop)
+{
+	void* glCtx=0;
+	void* glDC = 0;
+
+#ifdef _WIN32
+	glCtx = wglGetCurrentContext();
+	//wglGetCurrentDC only exists on Windows, so it has to stay inside this branch
+	glDC = wglGetCurrentDC();
+#else //!_WIN32
+	//assign to the variable declared above instead of redeclaring it with a different type
+	glCtx = (void*)glXGetCurrentContext();
+#endif //!_WIN32
+
+	int ciErrNum = 0;
+#ifdef CL_PLATFORM_INTEL
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+#else
+	cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+#endif
+
+	if (useInterop)
+	{
+		g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
+	} else
+	{
+		g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
+	}
+
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
+
+	if (numDev>0)
+	{
+		g_device= btOpenCLUtils::getDevice(g_cxMainContext,0);
+		btOpenCLDeviceInfo clInfo;
+		btOpenCLUtils::getDeviceInfo(g_device,clInfo);
+		btOpenCLUtils::printDeviceInfo(g_device);
+		g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, g_device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	} else
+	{
+		//without a device neither g_device nor g_cqCommandQue is set up; say so instead of failing silently
+		printf("error: the OpenCL context contains no devices\n");
+	}
+
+}
+
+
+
+
+//Remembers the renderer, zeroes the shape/instance counters and allocates the InternalData block.
+//No OpenCL objects are created here; that happens in init().
+CLPhysicsDemo::CLPhysicsDemo(Win32OpenGLWindow*	renderer)
+	:m_renderer(renderer),
+	m_numCollisionShapes(0),
+	m_numPhysicsInstances(0),
+	m_data(0)
+{
+	m_data = new InternalData;
+}
+
+//Releases nothing. The buffers, the broadphase and m_data are deleted by cleanup(), which
+//callers presumably have to invoke themselves before destruction - TODO confirm.
+CLPhysicsDemo::~CLPhysicsDemo()
+{
+
+}
+
+
+//Forwards to btGpuNarrowphaseAndSolver::writeAllBodiesToGpu(); a no-op while no
+//narrowphase/solver object exists.
+void CLPhysicsDemo::writeBodiesToGpu()
+{
+	if (!narrowphaseAndSolver)
+	{
+		return;
+	}
+
+	narrowphaseAndSolver->writeAllBodiesToGpu();
+}
+
+//Builds a convex shape from a strided vertex array and registers it with the narrowphase/solver.
+//
+//vertices      : first vertex; the first three floats of every vertex are read as x,y,z
+//strideInBytes : distance in bytes between consecutive vertices
+//numVertices   : number of vertices to read
+//scaling       : three per-axis factors multiplied into every vertex
+//
+//Returns the value of btGpuNarrowphaseAndSolver::registerShape, or -1 when no narrowphase/solver
+//exists. For a non-negative index the shape's AABB corners (min, max) are written to entries
+//2*index and 2*index+1 of m_localShapeAABB. The new shape replaces the s_convexHeightField global.
+int		CLPhysicsDemo::registerCollisionShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling)
+{
+	btAlignedObjectArray<btVector3> scaledPoints;
+
+	//walk the array in bytes so any vertex layout with leading x,y,z floats works
+	const unsigned char* rawBytes = (const unsigned char*) vertices;
+	for (int v=0;v<numVertices;v++)
+	{
+		const float* xyz = (const float*) (rawBytes + v*strideInBytes);
+		scaledPoints.push_back(btVector3(xyz[0]*scaling[0],xyz[1]*scaling[1],xyz[2]*scaling[2]));
+	}
+
+	btConvexUtility hullBuilder;
+	bool doMerge = true;
+	hullBuilder.initializePolyhedralFeatures(scaledPoints,doMerge);
+
+	//copy the four plane coefficients of every face into a temporary float4 array
+	int faceCount = hullBuilder.m_faces.size();
+	float4* planeEquations = new float4[faceCount];
+	for (int f=0;f<faceCount;f++)
+	{
+		float4& dst = planeEquations[f];
+		dst.x = hullBuilder.m_faces[f].m_plane[0];
+		dst.y = hullBuilder.m_faces[f].m_plane[1];
+		dst.z = hullBuilder.m_faces[f].m_plane[2];
+		dst.w = hullBuilder.m_faces[f].m_plane[3];
+	}
+	printf("numFaces = %d\n", faceCount);
+
+	s_convexHeightField = new ConvexHeightField(planeEquations,faceCount);
+
+	int shapeIndex = narrowphaseAndSolver ? narrowphaseAndSolver->registerShape(s_convexHeightField) : -1;
+
+	if (shapeIndex>=0)
+	{
+		btAABBHost minCorner, maxCorner;
+
+		minCorner.fx = s_convexHeightField->m_aabb.m_min.x;
+		minCorner.fy = s_convexHeightField->m_aabb.m_min.y;
+		minCorner.fz = s_convexHeightField->m_aabb.m_min.z;
+		minCorner.uw = shapeIndex;
+
+		maxCorner.fx = s_convexHeightField->m_aabb.m_max.x;
+		maxCorner.fy = s_convexHeightField->m_aabb.m_max.y;
+		maxCorner.fz = s_convexHeightField->m_aabb.m_max.z;
+		maxCorner.uw = shapeIndex;
+
+		const int firstSlot = shapeIndex*2;
+		m_data->m_localShapeAABB->write(&minCorner,1,firstSlot);
+		m_data->m_localShapeAABB->write(&maxCorner,1,firstSlot+1);
+		adl::DeviceUtils::waitForCompletion( g_deviceCL );
+	}
+
+	m_numCollisionShapes++;
+	delete[] planeEquations;
+	return shapeIndex;
+}
+
+//Adds one rigid body instance to the simulation.
+//
+//mass, position, orientation : forwarded unchanged to btGpuNarrowphaseAndSolver::registerRigidBody;
+//                              position[0..2] is also used as the centre of the initial AABB
+//collisionShapeIndex         : shape index; a broadphase proxy is only created when it is >= 0
+//userPointer                 : passed through to the broadphase proxy
+//
+//Returns the value of registerRigidBody, or -1 when no narrowphase/solver exists. The body is
+//registered with writeToGpu == false, so callers upload it later (see writeBodiesToGpu).
+int		CLPhysicsDemo::registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, void* userPointer)
+{
+	//initial AABB: the body position (x,y,z) padded by one unit in every direction
+	btVector3 aabbMin(position[0],position[1],position[2]);
+	btVector3 aabbMax = aabbMin;
+	aabbMin -= btVector3(1.f,1.f,1.f);
+	aabbMax += btVector3(1.f,1.f,1.f);
+
+	if (collisionShapeIndex>=0)
+	{
+		//the returned proxy is not used here
+		m_data->m_Broadphase->createProxy(aabbMin,aabbMax,collisionShapeIndex,userPointer,1,1,0,0);//m_dispatcher);
+	}
+			
+	bool writeToGpu = false;
+	int bodyIndex = -1;
+
+	if (narrowphaseAndSolver)
+		bodyIndex = narrowphaseAndSolver->registerRigidBody(collisionShapeIndex,mass,position,orientation,writeToGpu);
+
+	m_numPhysicsInstances++;
+	return bodyIndex;
+}
+
+
+
+//Creates everything the demo needs on the OpenCL side: context/queue (InitCL), the adl device
+//wrapper, the velocity/body-time/shape-AABB buffers, the narrowphase/solver, the grid broadphase,
+//the integrateTransforms kernel and the find-pairs helper state.
+//
+//preferredDevice / preferredPlatform : forwarded to InitCL (which uses them only without interop)
+//useInterop                          : forwarded to InitCL to request an OpenGL-sharing context
+void	CLPhysicsDemo::init(int preferredDevice, int preferredPlatform, bool useInterop)
+{
+	
+	//honour the caller's device/platform preference instead of hard-coding -1,-1
+	InitCL(preferredDevice,preferredPlatform,useInterop);
+
+#define CUSTOM_CL_INITIALIZATION
+#ifdef CUSTOM_CL_INITIALIZATION
+	g_deviceCL = new adl::DeviceCL();
+	g_deviceCL->m_deviceIdx = g_device;
+	g_deviceCL->m_context = g_cxMainContext;
+	g_deviceCL->m_commandQueue = g_cqCommandQue;
+	g_deviceCL->m_kernelManager = new adl::KernelManager;
+
+#else
+	DeviceUtils::Config cfg;
+	cfg.m_type = DeviceUtils::Config::DEVICE_CPU;
+	g_deviceCL = DeviceUtils::allocate( TYPE_CL, cfg );
+#endif
+
+	//adl::Solver<adl::TYPE_CL>::allocate(g_deviceCL->allocate(
+	m_data->m_linVelBuf = new adl::Buffer<btVector3>(g_deviceCL,MAX_CONVEX_BODIES_CL);
+	m_data->m_angVelBuf = new adl::Buffer<btVector3>(g_deviceCL,MAX_CONVEX_BODIES_CL);
+	m_data->m_bodyTimes = new adl::Buffer<float>(g_deviceCL,MAX_CONVEX_BODIES_CL);
+
+	//registerCollisionShape writes two entries per shape (slots 2*i and 2*i+1), so the buffer
+	//needs twice MAX_CONVEX_SHAPES_CL entries to hold every allowed shape
+	m_data->m_localShapeAABB = new adl::Buffer<btAABBHost>(g_deviceCL,2*MAX_CONVEX_SHAPES_CL);
+	
+	//raw cl_mem aliases handed to the plain OpenCL helper functions
+	gLinVelMem = (cl_mem)m_data->m_linVelBuf->m_ptr;
+	gAngVelMem = (cl_mem)m_data->m_angVelBuf->m_ptr;
+	gBodyTimes = (cl_mem)m_data->m_bodyTimes->m_ptr;
+
+	narrowphaseAndSolver = new btGpuNarrowphaseAndSolver(g_deviceCL);
+
+	int maxObjects = btMax(256,MAX_CONVEX_BODIES_CL);
+	int maxPairsSmallProxy = 32;
+	btOverlappingPairCache* overlappingPairCache=0;
+
+	m_data->m_Broadphase = new btGridBroadphaseCl(overlappingPairCache,btVector3(4.f, 4.f, 4.f), 128, 128, 128,maxObjects, maxObjects, maxPairsSmallProxy, 100.f, 128,
+		g_cxMainContext ,g_device,g_cqCommandQue, g_deviceCL);
+
+	cl_program prog = btOpenCLUtils::compileCLProgramFromString(g_cxMainContext,g_device,interopKernelString,0,"",INTEROPKERNEL_SRC_PATH);
+	g_integrateTransformsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext, g_device,interopKernelString, "integrateTransformsKernel" ,0,prog);
+	
+	initFindPairs(gFpIO, g_cxMainContext, g_device, g_cqCommandQue, MAX_CONVEX_BODIES_CL);
+
+}
+	
+
+
+//Writes the host-side linear velocity, angular velocity and body-time arrays
+//(MAX_CONVEX_BODIES_CL entries each) into their device buffers, then calls
+//adl::DeviceUtils::waitForCompletion on the device.
+void CLPhysicsDemo::writeVelocitiesToGpu()
+{
+	InternalData& state = *m_data;
+	const int numEntries = MAX_CONVEX_BODIES_CL;
+
+	state.m_linVelBuf->write(state.m_linVelHost,numEntries);
+	state.m_angVelBuf->write(state.m_angVelHost,numEntries);
+	state.m_bodyTimes->write(state.m_bodyTimesHost,numEntries);
+
+	adl::DeviceUtils::waitForCompletion( g_deviceCL );
+}
+
+
+//Marks the demo as using OpenCL/OpenGL interop, wraps the cube_vbo vertex buffer in a new
+//btOpenCLGLInteropBuffer (stored in g_interopBuffer) and calls clFinish on the command queue.
+void CLPhysicsDemo::setupInterop()
+{
+	InternalData& state = *m_data;
+	state.m_useInterop = true;
+
+	btOpenCLGLInteropBuffer* sharedVertexBuffer = new btOpenCLGLInteropBuffer(g_cxMainContext,g_cqCommandQue,cube_vbo);
+	g_interopBuffer = sharedVertexBuffer;
+
+	clFinish(g_cqCommandQue);
+}
+
+//Deletes the narrowphase/solver, the device buffers, the broadphase, m_data, the adl device
+//wrapper with its kernel manager, the interop buffer and the last registered shape.
+//Every pointer is reset to null afterwards, so calling cleanup() twice (or before init()) is safe
+//and the 'if (narrowphaseAndSolver)' checks elsewhere in this file stay meaningful.
+//The OpenCL context, command queue and kernel objects are not released here.
+void	CLPhysicsDemo::cleanup()
+{
+	delete narrowphaseAndSolver;
+	narrowphaseAndSolver = 0;
+
+	if (m_data)
+	{
+		delete m_data->m_linVelBuf;
+		delete m_data->m_angVelBuf;
+		delete m_data->m_bodyTimes;
+		delete m_data->m_localShapeAABB;
+
+		delete m_data->m_Broadphase;
+		delete m_data;
+		m_data=0;
+	}
+
+	//these only aliased buffers that were deleted above
+	gLinVelMem = 0;
+	gAngVelMem = 0;
+	gBodyTimes = 0;
+
+	if (g_deviceCL)
+	{
+		delete g_deviceCL->m_kernelManager;
+		delete g_deviceCL;
+		g_deviceCL=0;
+	}
+
+	delete g_interopBuffer;
+	g_interopBuffer = 0;
+	delete s_convexHeightField;
+	s_convexHeightField = 0;
+}
+
+
+
+
+
+//Runs one simulation step and leaves the updated instance data in the cube_vbo vertex buffer.
+//
+//1. Make the vertex buffer available to OpenCL: acquire the shared GL object (interop), or map
+//   the VBO and copy it into clBuffer (non-interop; clBuffer is created on first use).
+//2. When kernels are enabled and instances exist: set up the AABBs, run the broadphase and, for
+//   an overlap count inside [0, MAX_BROADPHASE_COLLISION_CL), run setupBodies, the
+//   narrowphase/solver and copyBodyVelocities; then enqueue the integrateTransforms kernel.
+//3. Hand the buffer back to OpenGL: release the GL object (interop), or read clBuffer back
+//   into the mapped VBO and unmap it.
+void	CLPhysicsDemo::stepSimulation()
+{
+	BT_PROFILE("simulationLoop");
+	
+	{
+		BT_PROFILE("glFinish");
+		glFinish();
+	}
+	cl_int ciErrNum = CL_SUCCESS;
+
+
+	if(m_data->m_useInterop)
+	{
+		clBuffer = g_interopBuffer->getCLBUffer();
+		BT_PROFILE("clEnqueueAcquireGLObjects");
+		ciErrNum = clEnqueueAcquireGLObjects(g_cqCommandQue, 1, &clBuffer, 0, 0, NULL);
+		adl::DeviceUtils::waitForCompletion( g_deviceCL );
+	} else
+	{
+
+		glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
+		glFlush();
+
+		BT_PROFILE("glMapBuffer and clEnqueueWriteBuffer");
+
+		blocking=  CL_TRUE;
+		hostPtr=  (char*)glMapBuffer( GL_ARRAY_BUFFER,GL_READ_WRITE);//GL_WRITE_ONLY
+		if (!clBuffer)
+		{
+			clBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, VBOsize, 0, &ciErrNum);
+		}
+		adl::DeviceUtils::waitForCompletion( g_deviceCL );
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+		ciErrNum = clEnqueueWriteBuffer (	g_cqCommandQue,
+			clBuffer,
+			blocking,
+			0,
+			VBOsize,
+			hostPtr,0,0,0
+		);
+		adl::DeviceUtils::waitForCompletion( g_deviceCL );
+	}
+
+
+
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	if (runOpenCLKernels && m_numPhysicsInstances)
+	{
+
+		gFpIO.m_numObjects = m_numPhysicsInstances;
+		gFpIO.m_positionOffset = SHAPE_VERTEX_BUFFER_SIZE/4;
+		gFpIO.m_clObjectsBuffer = clBuffer;
+		gFpIO.m_dAABB = m_data->m_Broadphase->m_dAABB;
+		gFpIO.m_dlocalShapeAABB = (cl_mem)m_data->m_localShapeAABB->m_ptr;
+		gFpIO.m_numOverlap = 0;
+		//guarded like every other narrowphaseAndSolver use in this function
+		if (narrowphaseAndSolver)
+		{
+			BT_PROFILE("setupGpuAabbs");
+			setupGpuAabbsFull(gFpIO,narrowphaseAndSolver->getBodiesGpu() );
+		}
+
+		{
+			BT_PROFILE("calculateOverlappingPairs");
+			m_data->m_Broadphase->calculateOverlappingPairs(0, m_numPhysicsInstances);
+			gFpIO.m_dAllOverlappingPairs = m_data->m_Broadphase->m_dAllOverlappingPairs;
+			gFpIO.m_numOverlap = m_data->m_Broadphase->m_numPrefixSum;
+		}
+		
+		//printf("gFpIO.m_numOverlap = %d\n",gFpIO.m_numOverlap );
+		if (gFpIO.m_numOverlap>=0 && gFpIO.m_numOverlap<MAX_BROADPHASE_COLLISION_CL)
+		{
+			colorPairsOpenCL(gFpIO);
+
+			{
+				//BT_PROFILE("setupBodies");
+				if (narrowphaseAndSolver)
+					setupBodies(gFpIO, gLinVelMem, gAngVelMem, narrowphaseAndSolver->getBodiesGpu(), narrowphaseAndSolver->getBodyInertiasGpu());
+			}
+			if (gFpIO.m_numOverlap)
+			{
+				BT_PROFILE("computeContactsAndSolver");
+				if (narrowphaseAndSolver)
+					narrowphaseAndSolver->computeContactsAndSolver(gFpIO.m_dAllOverlappingPairs,gFpIO.m_numOverlap);
+			}
+
+			{
+				BT_PROFILE("copyBodyVelocities");
+				if (narrowphaseAndSolver)
+					copyBodyVelocities(gFpIO, gLinVelMem, gAngVelMem, narrowphaseAndSolver->getBodiesGpu(), narrowphaseAndSolver->getBodyInertiasGpu());
+			}
+
+		} else
+		{
+			printf("error, gFpIO.m_numOverlap = %d\n",gFpIO.m_numOverlap);
+			btAssert(0);
+		}
+
+
+		{
+			BT_PROFILE("integrateTransforms");
+
+			if (runOpenCLKernels)
+			{
+				int numObjects = m_numPhysicsInstances;
+				int offset = SHAPE_VERTEX_BUFFER_SIZE/4;
+
+				//check every argument: a later successful call must not hide an earlier failure
+				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 0, sizeof(int), &offset);
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 1, sizeof(int), &numObjects);
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 2, sizeof(cl_mem), (void*)&clBuffer );
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 3, sizeof(cl_mem), (void*)&gLinVelMem);
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 4, sizeof(cl_mem), (void*)&gAngVelMem);
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+				ciErrNum = clSetKernelArg(g_integrateTransformsKernel, 5, sizeof(cl_mem), (void*)&gBodyTimes);
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+				//round the instance count up to a whole number of work groups; numObjects is
+				//non-zero in this branch, so the global size is at least one full group
+				size_t workGroupSize = 64;
+				size_t numInstances = (size_t)numObjects;
+				size_t numWorkItems = workGroupSize*((numInstances + workGroupSize - 1) / workGroupSize);
+
+				ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_integrateTransformsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
+				oclCHECKERROR(ciErrNum, CL_SUCCESS);
+			}
+		}
+			
+
+	}
+
+	if(m_data->m_useInterop)
+	{
+		BT_PROFILE("clEnqueueReleaseGLObjects");
+		ciErrNum = clEnqueueReleaseGLObjects(g_cqCommandQue, 1, &clBuffer, 0, 0, 0);
+		adl::DeviceUtils::waitForCompletion( g_deviceCL );
+	}
+	else
+	{
+		BT_PROFILE("clEnqueueReadBuffer clReleaseMemObject and glUnmapBuffer");
+		ciErrNum = clEnqueueReadBuffer (	g_cqCommandQue,
+			clBuffer,
+			blocking,
+			0,
+			VBOsize,
+			hostPtr,0,0,0);
+
+		//clReleaseMemObject(clBuffer);
+		adl::DeviceUtils::waitForCompletion( g_deviceCL );
+		glUnmapBuffer( GL_ARRAY_BUFFER);
+		glFlush();
+	}
+
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+
+	if (runOpenCLKernels)
+	{
+		BT_PROFILE("clFinish");
+		clFinish(g_cqCommandQue);
+	}
+
+	
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/CLPhysicsDemo.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/CLPhysicsDemo.h
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef CL_PHYSICS_DEMO_H
+#define CL_PHYSICS_DEMO_H
+
+class Win32OpenGLWindow;
+
+///Interface of the OpenCL physics demo: shape/instance registration, host-to-GPU uploads and stepping.
+///Only declarations are visible in this header; the member functions are defined elsewhere.
+struct CLPhysicsDemo
+{
+	///window/renderer this demo is attached to (presumably the constructor argument; ownership is not visible here)
+	Win32OpenGLWindow* m_renderer;
+
+	///presumably the number of shapes registered through registerCollisionShape -- TODO confirm in the source file
+	int m_numCollisionShapes;
+
+	///number of physics instances; used as the object count when the transform-integration kernel is launched
+	int m_numPhysicsInstances;
+
+	///opaque implementation data; struct InternalData is only forward-declared here
+	struct InternalData* m_data;
+	
+	CLPhysicsDemo(Win32OpenGLWindow*	renderer);
+	
+	virtual ~CLPhysicsDemo();
+
+	//btOpenCLGLInteropBuffer*	m_interopBuffer;
+	
+	///@param preferredDevice,preferredPlatform OpenCL device/platform selection (exact semantics are defined by the implementation)
+	///@param useInterop selects OpenCL/OpenGL buffer sharing instead of the map/read-back path
+	void	init(int preferredDevice, int preferredPlatform, bool useInterop);
+	
+	void	setupInterop();
+
+	///@return presumably an index identifying the new collision shape -- TODO confirm
+	int		registerCollisionShape(const float* vertices, int strideInBytes, int numVertices, const float* scaling);
+
+	///@param collisionShapeIndex presumably a value returned by registerCollisionShape -- TODO confirm
+	int		registerPhysicsInstance(float mass, const float* position, const float* orientation, int collisionShapeIndex, void* userPointer);
+
+	void	writeVelocitiesToGpu();
+	void	writeBodiesToGpu();
+
+	void	cleanup();
+
+	void	stepSimulation();
+};
+
+#endif//CL_PHYSICS_DEMO_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/DemoSettings.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/DemoSettings.h
@@ -0,0 +1,24 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef DEMO_SETTINGS_H
+#define DEMO_SETTINGS_H
+
+//Number of bytes reserved at the start of the shared vertex buffer for shape vertex data.
+//The value is parenthesized so the macro stays correct inside larger expressions
+//(for example x / SHAPE_VERTEX_BUFFER_SIZE or x % SHAPE_VERTEX_BUFFER_SIZE).
+#define SHAPE_VERTEX_BUFFER_SIZE (1024*1024)
+
+//Byte offset at which the per-instance data (positions, orientations, colors, scales) starts.
+#define SHAPE_BUFFER_SIZE (SHAPE_VERTEX_BUFFER_SIZE)
+
+
+#endif //DEMO_SETTINGS_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GLInstancingRenderer.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GLInstancingRenderer.cpp
@@ -0,0 +1,861 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#include "OpenGLInclude.h"
+#include "GLInstancingRenderer.h"
+
+#include <string.h>
+#include "DemoSettings.h"
+#include <stdio.h>
+#include <assert.h>
+#include "LinearMath/btVector3.h"
+#include "LinearMath/btQuaternion.h"
+#include "LinearMath/btQuickprof.h"
+#include "LinearMath/btMatrix3x3.h"
+
+#include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowphaseAndSolver.h"//for MAX_CONVEX_BODIES_CL
+
+///Render bookkeeping for one registered shape: its GL objects, mesh sizes and
+///the offsets of its vertices and instances inside the shared buffers.
+struct btGraphicsInstance
+{
+	GLuint	m_cube_vao;
+	GLuint	m_index_vbo;
+	int		m_numIndices;
+	int		m_numVertices;
+
+	int		m_numGraphicsInstances;
+
+	int		m_instanceOffset;
+	int		m_vertexArrayOffset;
+
+	///GL handles and mesh counts start at -1, instance count and both offsets start at 0
+	btGraphicsInstance()
+		: m_cube_vao(-1),
+		  m_index_vbo(-1),
+		  m_numIndices(-1),
+		  m_numVertices(-1),
+		  m_numGraphicsInstances(0),
+		  m_instanceOffset(0),
+		  m_vertexArrayOffset(0)
+	{
+	}
+};
+
+
+
+//File-level camera and window state read by updateCamera(). Despite the m_ prefix these are globals, not members.
+
+//true selects the glOrtho projection in updateCamera(), false the glFrustum/gluLookAt path
+bool m_ortho = false;
+//window size used by updateCamera() to derive the aspect ratio
+int m_glutScreenWidth = 1024;
+int m_glutScreenHeight = 768;
+
+
+
+//defined in another translation unit; not referenced by the code visible in this part of the file
+extern int gShapeIndex;
+
+
+btVector3 m_cameraPosition(0,0,0);//will be overridden by a position computed from azi/ele
+btVector3 m_cameraTargetPosition(30,-5,-20);
+btScalar m_cameraDistance = 95;
+//note: updateCamera() declares a local with the same name, so this global is not the one it reads
+btVector3 m_cameraUp(0,1,0);
+//azimuth and elevation in degrees (updateCamera() converts both to radians)
+float m_azi=95.f;
+float m_ele=15.f;
+
+
+
+
+//total size in bytes of the shared vertex buffer; assigned in GLInstancingRenderer::InitShaders()
+int VBOsize =0;
+
+
+
+///CPU-side staging arrays holding the per-instance attributes that
+///GLInstancingRenderer::writeTransforms() copies into the vertex buffer.
+struct InternalDataRenderer
+{
+	GLfloat*	m_instance_positions_ptr;	//4 floats per instance
+	GLfloat*	m_instance_quaternion_ptr;	//4 floats per instance
+	GLfloat*	m_instance_colors_ptr;		//4 floats per instance
+	GLfloat*	m_instance_scale_ptr;		//3 floats per instance
+
+	///all arrays start out as null pointers
+	InternalDataRenderer()
+		: m_instance_positions_ptr(0),
+		  m_instance_quaternion_ptr(0),
+		  m_instance_colors_ptr(0),
+		  m_instance_scale_ptr(0)
+	{
+	}
+};
+
+//GL program object returned by gltLoadShaderPair() in InitShaders() and used by RenderScene()
+static GLuint               instancingShader;        // The instancing renderer
+
+//shared vertex buffer: shape vertices first, per-instance data from byte offset SHAPE_BUFFER_SIZE on.
+//Not static, so other translation units can reference it (presumably the OpenCL side -- TODO confirm).
+GLuint               cube_vbo;
+
+//texture object created once in myinit()
+static GLuint				m_texturehandle;
+
+//not referenced by the code visible in this part of the file
+static bool                 done = false;
+//uniform locations queried in InitShaders()
+static GLint                angle_loc = 0;
+static GLint ModelViewMatrix;
+static GLint ProjectionMatrix;
+
+
+
+///Creates the internal data block and allocates the CPU staging arrays with room for
+///MAX_CONVEX_BODIES_CL instances (4 floats each for position, orientation and color, 3 for scale).
+///The arrays are released by CleanupShaders(), not by the destructor.
+GLInstancingRenderer::GLInstancingRenderer()
+{
+	InternalDataRenderer* data = new InternalDataRenderer;
+	m_data = data;
+
+	data->m_instance_positions_ptr  = static_cast<GLfloat*>(new float[MAX_CONVEX_BODIES_CL*4]);
+	data->m_instance_quaternion_ptr = static_cast<GLfloat*>(new float[MAX_CONVEX_BODIES_CL*4]);
+	data->m_instance_colors_ptr     = static_cast<GLfloat*>(new float[MAX_CONVEX_BODIES_CL*4]);
+	data->m_instance_scale_ptr      = static_cast<GLfloat*>(new float[MAX_CONVEX_BODIES_CL*3]);
+}
+
+///Frees only the InternalDataRenderer object itself. The staging arrays it points to are
+///released in CleanupShaders(), so that must be called before destruction to avoid leaking them.
+///NOTE(review): the btGraphicsInstance objects allocated in registerShape() are not deleted here.
+GLInstancingRenderer::~GLInstancingRenderer()
+{
+	delete m_data;
+}
+
+
+//location of the "Diffuse" sampler uniform, queried in InitShaders()
+static GLint                uniform_texture_diffuse = 0;
+
+//used for dynamic loading from disk (default switched off)
+//NOTE(review): shaderText is not referenced by the code visible in this part of the file
+#define MAX_SHADER_LENGTH   8192
+static GLubyte shaderText[MAX_SHADER_LENGTH];
+
+//GLSL 3.30 vertex shader for the instanced draw; InitShaders() passes it to gltLoadShaderPair().
+//Per-vertex inputs:   position (location 0), uvcoords (3), vertexnormal (4).
+//Per-instance inputs: instance_position (1), instance_quaternion (2), instance_color (5), instance_scale (6).
+//main() scales the vertex, rotates it and the normal by the instance quaternion, adds the instance
+//position and applies ModelViewMatrix and ProjectionMatrix.
+//The 'angle' uniform and the helpers quatFromAxisAngle/quatRotate are declared but not used by main().
+static const char* vertexShader= \
+"#version 330\n"
+"precision highp float;\n"
+"\n"
+"\n"
+"\n"
+"layout (location = 0) in vec4 position;\n"
+"layout (location = 1) in vec4 instance_position;\n"
+"layout (location = 2) in vec4 instance_quaternion;\n"
+"layout (location = 3) in vec2 uvcoords;\n"
+"layout (location = 4) in vec3 vertexnormal;\n"
+"layout (location = 5) in vec4 instance_color;\n"
+"layout (location = 6) in vec3 instance_scale;\n"
+"\n"
+"\n"
+"uniform float angle = 0.0;\n"
+"uniform mat4 ModelViewMatrix;\n"
+"uniform mat4 ProjectionMatrix;\n"
+"\n"
+"out Fragment\n"
+"{\n"
+"     vec4 color;\n"
+"} fragment;\n"
+"\n"
+"out Vert\n"
+"{\n"
+"	vec2 texcoord;\n"
+"} vert;\n"
+"\n"
+"\n"
+"vec4 quatMul ( in vec4 q1, in vec4 q2 )\n"
+"{\n"
+"    vec3  im = q1.w * q2.xyz + q1.xyz * q2.w + cross ( q1.xyz, q2.xyz );\n"
+"    vec4  dt = q1 * q2;\n"
+"    float re = dot ( dt, vec4 ( -1.0, -1.0, -1.0, 1.0 ) );\n"
+"    return vec4 ( im, re );\n"
+"}\n"
+"\n"
+"vec4 quatFromAxisAngle(vec4 axis, in float angle)\n"
+"{\n"
+"    float cah = cos(angle*0.5);\n"
+"    float sah = sin(angle*0.5);\n"
+"	float d = inversesqrt(dot(axis,axis));\n"
+"	vec4 q = vec4(axis.x*sah*d,axis.y*sah*d,axis.z*sah*d,cah);\n"
+"	return q;\n"
+"}\n"
+"//\n"
+"// vector rotation via quaternion\n"
+"//\n"
+"vec4 quatRotate3 ( in vec3 p, in vec4 q )\n"
+"{\n"
+"    vec4 temp = quatMul ( q, vec4 ( p, 0.0 ) );\n"
+"    return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
+"}\n"
+"vec4 quatRotate ( in vec4 p, in vec4 q )\n"
+"{\n"
+"    vec4 temp = quatMul ( q, p );\n"
+"    return quatMul ( temp, vec4 ( -q.x, -q.y, -q.z, q.w ) );\n"
+"}\n"
+"\n"
+"out vec3 lightDir,normal,ambient;\n"
+"\n"
+"void main(void)\n"
+"{\n"
+"	vec4 q = instance_quaternion;\n"
+"	ambient = vec3(0.3,.3,0.3);\n"
+"		\n"
+"		\n"
+"	vec4 local_normal = (quatRotate3( vertexnormal,q));\n"
+"	vec3 light_pos = vec3(-0.8,1,-0.6);\n"
+"	normal = local_normal.xyz;\n"//normalize(ModelViewMatrix * local_normal).xyz;\n"
+"\n"
+"	lightDir = normalize(light_pos);//gl_LightSource[0].position.xyz));\n"
+"//	lightDir = normalize(vec3(gl_LightSource[0].position));\n"
+"		\n"
+"	vec4 axis = vec4(1,1,1,0);\n"
+"	vec4 localcoord = quatRotate3( position.xyz*instance_scale,q);\n"
+"	vec4 vertexPos = ProjectionMatrix * ModelViewMatrix *(instance_position+localcoord);\n"
+"\n"
+"	gl_Position = vertexPos;\n"
+"	\n"
+"	fragment.color = instance_color;\n"
+"	vert.texcoord = uvcoords;\n"
+"}\n"
+;
+
+
+//GLSL 3.30 fragment shader paired with vertexShader; InitShaders() passes it to gltLoadShaderPair().
+//main() multiplies the instance color with the 'Diffuse' texture sample and scales the rgb result by
+//max(dot(lightDir, normalize(normal)), 0.2) plus the ambient term; alpha is taken from the texel.
+//main_textured() is an unused alternative that outputs the raw texture sample.
+//Note: the "cf = ..." line has no trailing \n, so it shares a source line with "af = 1.0;" in the GLSL text.
+//NOTE(review): texture2D() is the legacy sampling function; it relies on a compatibility-profile context.
+static const char* fragmentShader= \
+"#version 330\n"
+"precision highp float;\n"
+"\n"
+"in Fragment\n"
+"{\n"
+"     vec4 color;\n"
+"} fragment;\n"
+"\n"
+"in Vert\n"
+"{\n"
+"	vec2 texcoord;\n"
+"} vert;\n"
+"\n"
+"uniform sampler2D Diffuse;\n"
+"\n"
+"in vec3 lightDir,normal,ambient;\n"
+"\n"
+"out vec4 color;\n"
+"\n"
+"void main_textured(void)\n"
+"{\n"
+"    color =  texture2D(Diffuse,vert.texcoord);//fragment.color;\n"
+"}\n"
+"\n"
+"void main(void)\n"
+"{\n"
+"    vec4 texel = fragment.color*texture2D(Diffuse,vert.texcoord);//fragment.color;\n"
+"	vec3 ct,cf;\n"
+"	float intensity,at,af;\n"
+"	intensity = max(dot(lightDir,normalize(normal)),.2);\n"
+"	cf = intensity*vec3(1.0,1.0,1.0)+ambient;"
+"	af = 1.0;\n"
+"		\n"
+"	ct = texel.rgb;\n"
+"	at = texel.a;\n"
+"		\n"
+"	color  = vec4(ct * cf, at * af);	\n"
+"}\n"
+;
+
+
+// Hand a single NUL-terminated source string to an existing shader object.
+// szShaderSrc: GLSL source text; shader: handle obtained from glCreateShader.
+void gltLoadShaderSrc(const char *szShaderSrc, GLuint shader)
+{
+	// one-element source array; a NULL length array tells GL the string is NUL-terminated
+	const GLchar *sources[1] = { (const GLchar *)szShaderSrc };
+	glShaderSource(shader, 1, sources, NULL);
+}
+
+
+// Returns true when 'shader' compiled; otherwise prints its info log to stderr and returns false.
+static bool gltShaderCompiled(GLuint shader)
+{
+	GLint status = GL_FALSE;
+	glGetShaderiv(shader, GL_COMPILE_STATUS, &status);
+	if (status != GL_FALSE)
+		return true;
+
+	char temp[256] = "";
+	glGetShaderInfoLog(shader, 256, NULL, temp);
+	fprintf( stderr, "Compile failed:\n%s\n", temp);
+	return false;
+}
+
+// Compile a vertex/fragment shader pair and link them into a program.
+//   szVertexProg   - GLSL source of the vertex shader
+//   szFragmentProg - GLSL source of the fragment shader
+// Returns the program handle, or 0 when linking fails (the link log is printed to stderr).
+// A compile failure prints the log and terminates the process, as before, but now with a
+// non-zero exit status and after the shader objects have been released.
+GLuint gltLoadShaderPair(const char *szVertexProg, const char *szFragmentProg)
+{
+	// Create shader objects
+	GLuint hVertexShader = glCreateShader(GL_VERTEX_SHADER);
+	GLuint hFragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
+
+	// Compile the sources supplied by the caller (previously both parameters were ignored
+	// and the file-level vertexShader/fragmentShader strings were always used)
+	gltLoadShaderSrc(szVertexProg, hVertexShader);
+	gltLoadShaderSrc(szFragmentProg, hFragmentShader);
+
+	glCompileShader(hVertexShader);
+	glCompileShader(hFragmentShader);
+
+	// Vertex shader is checked first; the fragment shader is only inspected when it passed
+	if (!gltShaderCompiled(hVertexShader) || !gltShaderCompiled(hFragmentShader))
+	{
+		glDeleteShader(hVertexShader);
+		glDeleteShader(hFragmentShader);
+		assert(0);
+		exit(1);
+	}
+
+	// Link them
+	GLuint hReturn = glCreateProgram();
+	glAttachShader(hReturn, hVertexShader);
+	glAttachShader(hReturn, hFragmentShader);
+
+	glLinkProgram(hReturn);
+
+	// The shader objects are no longer needed once the program has been linked
+	glDeleteShader(hVertexShader);
+	glDeleteShader(hFragmentShader);
+
+	// Make sure link worked too
+	GLint testVal = GL_FALSE;
+	glGetProgramiv(hReturn, GL_LINK_STATUS, &testVal);
+	if(testVal == GL_FALSE)
+	{
+		char temp[256] = "";
+		glGetProgramInfoLog(hReturn, 256, NULL, temp);
+		fprintf( stderr, "Link failed:\n%s\n", temp);
+		glDeleteProgram(hReturn);
+		return 0;
+	}
+
+	return hReturn;
+}
+
+
+///Copies the position, orientation, color and scale of every registered instance from the CPU
+///staging arrays into the mapped vertex buffer (cube_vbo).
+///Buffer layout behind the shape region (byte offset SHAPE_BUFFER_SIZE): all positions, then all
+///orientations, then all colors (4 floats each), then all scales (3 floats each). The section sizes
+///are based on the current total instance count, which is the same layout RenderScene() reads.
+///If the buffer cannot be mapped an error is printed and nothing is written.
+///GL_ARRAY_BUFFER is left bound to cube_vbo on return.
+void GLInstancingRenderer::writeTransforms()
+{
+	glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
+	glFlush();
+	
+	char* orgBase =  (char*)glMapBuffer( GL_ARRAY_BUFFER,GL_READ_WRITE);
+	if (!orgBase)
+	{
+		//glMapBuffer returns NULL on failure; writing through it would crash
+		fprintf(stderr, "GLInstancingRenderer::writeTransforms: glMapBuffer failed\n");
+		return;
+	}
+
+	int totalNumInstances= 0;
+
+	for (int k=0;k<m_graphicsInstances.size();k++)
+	{
+		totalNumInstances+=m_graphicsInstances[k]->m_numGraphicsInstances;
+	}
+
+	//the layout only depends on the total instance count, so compute it once
+	const size_t positionBytes = totalNumInstances*sizeof(float)*4;
+	const size_t orientationBytes = totalNumInstances*sizeof(float)*4;
+	const size_t colorBytes = totalNumInstances*sizeof(float)*4;
+
+	float* positions = (float*)(orgBase+SHAPE_BUFFER_SIZE);
+	float* orientations = (float*)(orgBase+SHAPE_BUFFER_SIZE + positionBytes);
+	float* colors= (float*)(orgBase+SHAPE_BUFFER_SIZE + positionBytes+orientationBytes);
+	float* scaling= (float*)(orgBase+SHAPE_BUFFER_SIZE + positionBytes+orientationBytes+colorBytes);
+
+	for (int k=0;k<m_graphicsInstances.size();k++)
+	{
+		const btGraphicsInstance* gfxObj = m_graphicsInstances[k];
+		const int first = gfxObj->m_instanceOffset;
+		const int count = gfxObj->m_numGraphicsInstances;
+		if (count<=0)
+			continue;
+
+		//the instances of one shape occupy a contiguous slot range in every array
+		memcpy(positions+first*4, m_data->m_instance_positions_ptr+first*4, count*4*sizeof(float));
+		memcpy(orientations+first*4, m_data->m_instance_quaternion_ptr+first*4, count*4*sizeof(float));
+		memcpy(colors+first*4, m_data->m_instance_colors_ptr+first*4, count*4*sizeof(float));
+		memcpy(scaling+first*3, m_data->m_instance_scale_ptr+first*3, count*3*sizeof(float));
+	}
+
+	glUnmapBuffer( GL_ARRAY_BUFFER);
+	//the original note here says the animation does not always work/blocks when "this glFinish" is removed;
+	//the call actually in place is glFlush
+	//@todo: figure out why
+	glFlush();
+}
+
+///Appends one instance of a registered shape to the CPU staging arrays.
+///@param shapeIndex index returned by registerShape()
+///@param position   x,y,z (w is stored as 1)
+///@param quaternion x,y,z,w
+///@param color      r,g,b,a
+///@param scaling    x,y,z
+///@return the number of instances now registered for this shape (a count, not an index),
+///        or -1 if shapeIndex is invalid or the staging arrays (MAX_CONVEX_BODIES_CL slots) are full.
+///Note: a shape's first slot is fixed when the following shape is registered, so adding instances to
+///a shape after a later shape already has instances makes their slots overlap.
+int GLInstancingRenderer::registerGraphicsInstance(int shapeIndex, const float* position, const float* quaternion, const float* color, const float* scaling)
+{
+	if (shapeIndex<0 || shapeIndex>=m_graphicsInstances.size())
+	{
+		fprintf(stderr, "GLInstancingRenderer::registerGraphicsInstance: invalid shape index %d\n", shapeIndex);
+		return -1;
+	}
+
+	btGraphicsInstance* gfxObj = m_graphicsInstances[shapeIndex];
+
+	const int index = gfxObj->m_numGraphicsInstances + gfxObj->m_instanceOffset;
+
+	//the staging arrays were allocated for MAX_CONVEX_BODIES_CL instances; never write past them
+	if (index<0 || index>=(MAX_CONVEX_BODIES_CL))
+	{
+		fprintf(stderr, "GLInstancingRenderer::registerGraphicsInstance: instance slot %d exceeds capacity\n", index);
+		return -1;
+	}
+
+	GLfloat* dstPosition = m_data->m_instance_positions_ptr + index*4;
+	dstPosition[0]=position[0];
+	dstPosition[1]=position[1];
+	dstPosition[2]=position[2];
+	dstPosition[3]=1;
+
+	GLfloat* dstQuaternion = m_data->m_instance_quaternion_ptr + index*4;
+	dstQuaternion[0]=quaternion[0];
+	dstQuaternion[1]=quaternion[1];
+	dstQuaternion[2]=quaternion[2];
+	dstQuaternion[3]=quaternion[3];
+
+	GLfloat* dstColor = m_data->m_instance_colors_ptr + index*4;
+	dstColor[0]=color[0];
+	dstColor[1]=color[1];
+	dstColor[2]=color[2];
+	dstColor[3]=color[3];
+
+	GLfloat* dstScale = m_data->m_instance_scale_ptr + index*3;
+	dstScale[0] = scaling[0];
+	dstScale[1] = scaling[1];
+	dstScale[2] = scaling[2];
+
+	gfxObj->m_numGraphicsInstances++;
+	return gfxObj->m_numGraphicsInstances;
+}
+
+
+///Registers a mesh: copies its vertices into the shape region of the shared vertex buffer and
+///creates an index buffer and a vertex array object for it.
+///@param vertices    numvertices records of 9 floats each (x,y,z,w, nx,ny,nz, u,v as read by RenderScene)
+///@param numvertices number of vertex records
+///@param indices     numIndices 32-bit indices (drawn as GL_TRIANGLES)
+///@param numIndices  number of indices
+///@return the shape index to pass to registerGraphicsInstance(), or -1 if the shape does not fit in
+///        the SHAPE_VERTEX_BUFFER_SIZE byte region or the vertex buffer cannot be mapped.
+///The new shape's instance slots start right behind the instances registered so far for the previous shape.
+int GLInstancingRenderer::registerShape(const float* vertices, int numvertices, const int* indices, int numIndices)
+{
+	//a new shape is placed directly behind the previously registered one
+	int instanceOffset = 0;
+	int vertexArrayOffset = 0;
+	if (m_graphicsInstances.size())
+	{
+		const btGraphicsInstance* prevObj = m_graphicsInstances[m_graphicsInstances.size()-1];
+		instanceOffset = prevObj->m_instanceOffset + prevObj->m_numGraphicsInstances;
+		vertexArrayOffset = prevObj->m_vertexArrayOffset + prevObj->m_numVertices;
+	}
+
+	const int vertexStrideInBytes = 9*sizeof(float);
+	const int sz = numvertices*vertexStrideInBytes;
+	const int destOffset = vertexStrideInBytes*vertexArrayOffset;
+
+	//the shape region is fixed-size; refuse meshes that would spill into the instance data
+	if (numvertices<0 || numIndices<0 || destOffset+sz > (SHAPE_VERTEX_BUFFER_SIZE))
+	{
+		fprintf(stderr, "GLInstancingRenderer::registerShape: shape with %d vertices does not fit in the vertex buffer\n", numvertices);
+		return -1;
+	}
+
+	glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
+	char* dest=  (char*)glMapBuffer( GL_ARRAY_BUFFER,GL_WRITE_ONLY);//GL_WRITE_ONLY
+	if (!dest)
+	{
+		fprintf(stderr, "GLInstancingRenderer::registerShape: glMapBuffer failed\n");
+		glBindBuffer(GL_ARRAY_BUFFER,0);
+		return -1;
+	}
+	memcpy(dest+destOffset,vertices,sz);
+	glUnmapBuffer( GL_ARRAY_BUFFER);
+
+	btGraphicsInstance* gfxObj = new btGraphicsInstance;
+	gfxObj->m_instanceOffset = instanceOffset;
+	gfxObj->m_vertexArrayOffset = vertexArrayOffset;
+	gfxObj->m_numIndices = numIndices;
+	gfxObj->m_numVertices = numvertices;
+	m_graphicsInstances.push_back(gfxObj);
+
+	glGenBuffers(1, &gfxObj->m_index_vbo);
+
+	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, gfxObj->m_index_vbo);
+	int indexBufferSizeInBytes = gfxObj->m_numIndices*sizeof(int);
+
+	glBufferData(GL_ELEMENT_ARRAY_BUFFER, indexBufferSizeInBytes, NULL, GL_STATIC_DRAW);
+	glBufferSubData(GL_ELEMENT_ARRAY_BUFFER,0,indexBufferSizeInBytes,indices);
+	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+	
+	glGenVertexArrays(1, &gfxObj->m_cube_vao);
+	glBindVertexArray(gfxObj->m_cube_vao);
+	glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
+	glBindVertexArray(0);
+	glBindBuffer(GL_ARRAY_BUFFER,0);
+
+	return m_graphicsInstances.size()-1;
+}
+
+
+
+
+///Builds the instancing shader program, looks up its uniform locations and allocates the shared
+///vertex buffer (shape region plus per-instance data for MAX_CONVEX_BODIES_CL instances).
+///The total buffer size in bytes is also stored in the file-level VBOsize.
+void GLInstancingRenderer::InitShaders()
+{
+	//program: compile, link, then make it current to query the uniform locations
+	instancingShader = gltLoadShaderPair(vertexShader,fragmentShader);
+	glLinkProgram(instancingShader);
+	glUseProgram(instancingShader);
+
+	angle_loc               = glGetUniformLocation(instancingShader, "angle");
+	ModelViewMatrix         = glGetUniformLocation(instancingShader, "ModelViewMatrix");
+	ProjectionMatrix        = glGetUniformLocation(instancingShader, "ProjectionMatrix");
+	uniform_texture_diffuse = glGetUniformLocation(instancingShader, "Diffuse");
+
+	//vertex buffer: created empty, filled later by registerShape() and writeTransforms()
+	glGenBuffers(1, &cube_vbo);
+	glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
+
+	int positionBytes    = (MAX_CONVEX_BODIES_CL*sizeof(float)*4);
+	int orientationBytes = (MAX_CONVEX_BODIES_CL*sizeof(float)*4);
+	int colorBytes       = (MAX_CONVEX_BODIES_CL*sizeof(float)*4);
+	int scaleBytes       = (MAX_CONVEX_BODIES_CL*sizeof(float)*3);
+
+	int totalBytes = SHAPE_BUFFER_SIZE  + positionBytes+orientationBytes+colorBytes+scaleBytes;
+	VBOsize = totalBytes;
+
+	glBufferData(GL_ARRAY_BUFFER, totalBytes, 0, GL_DYNAMIC_DRAW);//GL_STATIC_DRAW);
+
+	//leave no buffer or vertex array bound
+	glBindBuffer(GL_ARRAY_BUFFER,0);
+	glBindVertexArray(0);
+	glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
+}
+
+
+///Sets up the fixed-function GL state used for rendering: two directional lights, smooth shading,
+///depth testing, the clear color and a 256x256 white texture with a 2-texel black border.
+///The texture is created on the first call only. Asserts that no GL error is pending at the end.
+void myinit()
+{
+	//reading the error flag here also resets it before the state changes below
+	GLint err = glGetError();
+
+	const GLfloat ambient[4]  = { 1.0f, 1.2f, 0.2f, 1.0f };
+	const GLfloat diffuse[4]  = { 1.0f, 1.0f, 1.0f, 1.0f };
+	const GLfloat specular[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
+	//w = 0: both lights are directional
+	const GLfloat lightPositions[2][4] =
+	{
+		{ 10000.0f, 10000.0f, 10000.0f, 0.0f },
+		{ -1.0f, -10.0f, -1.0f, 0.0f }
+	};
+	const GLenum lights[2] = { GL_LIGHT0, GL_LIGHT1 };
+
+	for (int l=0;l<2;++l)
+	{
+		glLightfv(lights[l], GL_AMBIENT, ambient);
+		glLightfv(lights[l], GL_DIFFUSE, diffuse);
+		glLightfv(lights[l], GL_SPECULAR, specular);
+		glLightfv(lights[l], GL_POSITION, lightPositions[l]);
+	}
+
+	glEnable(GL_LIGHTING);
+	glEnable(GL_LIGHT0);
+	glEnable(GL_LIGHT1);
+
+	glShadeModel(GL_SMOOTH);
+
+	glEnable(GL_DEPTH_TEST);
+	glDepthFunc(GL_LESS);
+
+	glClearColor(0.7f,0.7f,0.7f,0.0f);
+	glEnable(GL_LIGHTING);
+	glEnable(GL_LIGHT0);
+
+	static bool textureEnabled = true;
+	static bool textureInitialized = false;
+
+	if (!textureEnabled)
+	{
+		glDisable(GL_TEXTURE_2D);
+	} else
+	{
+		if (!textureInitialized)
+		{
+			glActiveTexture(GL_TEXTURE0);
+
+			//RGB image: black where a texel lies within 2 texels of an edge, white elsewhere
+			const int dim = 256;
+			const int border = 2;
+			GLubyte* image = new GLubyte[dim*dim*3];
+			GLubyte* texel = image;
+			for (int row=0;row<dim;++row)
+			{
+				for (int col=0;col<dim;++col)
+				{
+					const bool onBorder = (col<border) || (row<border) || (col>dim-1-border) || (row>dim-1-border);
+					const GLubyte shade = onBorder ? 0 : 255;
+					texel[0]=shade;
+					texel[1]=shade;
+					texel[2]=shade;
+					texel+=3;
+				}
+			}
+
+			glGenTextures(1,(GLuint*)&m_texturehandle);
+			glBindTexture(GL_TEXTURE_2D,m_texturehandle);
+			glTexEnvf(GL_TEXTURE_ENV,GL_TEXTURE_ENV_MODE,GL_MODULATE);
+			glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_MIN_FILTER,GL_LINEAR_MIPMAP_LINEAR);
+			glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_MAG_FILTER,GL_LINEAR);
+			glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_WRAP_S,GL_REPEAT);
+			glTexParameterf(GL_TEXTURE_2D,GL_TEXTURE_WRAP_T,GL_REPEAT);
+			gluBuild2DMipmaps(GL_TEXTURE_2D,3,dim,dim,GL_RGB,GL_UNSIGNED_BYTE,image);
+			delete[] image;
+			textureInitialized=true;
+		}
+
+		glEnable(GL_TEXTURE_2D);
+		glBindTexture(GL_TEXTURE_2D,m_texturehandle);
+	}
+
+	glEnable(GL_COLOR_MATERIAL);
+
+	err = glGetError();
+	assert(err==GL_NO_ERROR);
+}
+
+///Rebuilds the fixed-function projection and modelview matrices from the file-level camera state
+///(m_azi, m_ele, m_cameraDistance, m_cameraTargetPosition, m_ortho and the screen size) and stores
+///the resulting eye position in m_cameraPosition.
+///If either screen dimension is zero the function returns after updating m_cameraPosition, leaving
+///an identity projection matrix and GL_PROJECTION as the current matrix mode.
+void updateCamera() 
+{
+	//local up vector and forward axis (named so that the up vector no longer shadows the global m_cameraUp)
+	const btVector3 cameraUp(0,1,0);
+	const int forwardAxis=2;
+
+	glMatrixMode(GL_PROJECTION);
+	glLoadIdentity();
+
+	btScalar rele = m_ele * btScalar(0.01745329251994329547);// rads per deg
+	btScalar razi = m_azi * btScalar(0.01745329251994329547);// rads per deg
+
+	//azimuth rotates around the up axis
+	btQuaternion rot(cameraUp,razi);
+
+	btVector3 eyePos(0,0,0);
+	eyePos[forwardAxis] = -m_cameraDistance;
+
+	btVector3 forward(eyePos[0],eyePos[1],eyePos[2]);
+	if (forward.length2() < SIMD_EPSILON)
+	{
+		forward.setValue(1.f,0.f,0.f);
+	}
+	//elevation rotates around the axis perpendicular to up and forward
+	btVector3 right = cameraUp.cross(forward);
+	btQuaternion roll(right,-rele);
+
+	eyePos = btMatrix3x3(rot) * btMatrix3x3(roll) * eyePos;
+
+	m_cameraPosition[0] = eyePos.getX();
+	m_cameraPosition[1] = eyePos.getY();
+	m_cameraPosition[2] = eyePos.getZ();
+	m_cameraPosition += m_cameraTargetPosition;
+
+	const float frustumZNear=1;
+	const float frustumZFar=1000;
+
+	//either dimension being zero would divide by zero below (the original only bailed out when both were zero)
+	if (m_glutScreenWidth == 0 || m_glutScreenHeight == 0)
+		return;
+
+	const bool landscape = m_glutScreenWidth > m_glutScreenHeight;
+
+	//aspect is always >= 1: long side divided by short side
+	float aspect;
+	btVector3 extents;
+
+	if (landscape) 
+	{
+		aspect = m_glutScreenWidth / (float)m_glutScreenHeight;
+		extents.setValue(aspect * 1.0f, 1.0f,0);
+	} else 
+	{
+		aspect = m_glutScreenHeight / (float)m_glutScreenWidth;
+		extents.setValue(1.0f, aspect*1.f,0);
+	}
+
+	if (m_ortho)
+	{
+		// reset matrix
+		glLoadIdentity();
+		extents *= m_cameraDistance;
+		btVector3 lower = m_cameraTargetPosition - extents;
+		btVector3 upper = m_cameraTargetPosition + extents;
+		glOrtho(lower.getX(), upper.getX(), lower.getY(), upper.getY(),-1000,1000);
+
+		glMatrixMode(GL_MODELVIEW);
+		glLoadIdentity();
+	} else
+	{
+		if (landscape) 
+		{
+			//wide window: widen the horizontal extent
+			glFrustum (-aspect * frustumZNear, aspect * frustumZNear, -frustumZNear, frustumZNear, frustumZNear, frustumZFar);
+		} else 
+		{
+			//tall window: the aspect factor belongs on the vertical extent, matching 'extents' above
+			//(the original repeated the landscape call here, which stretched the image)
+			glFrustum (-frustumZNear, frustumZNear, -aspect * frustumZNear, aspect * frustumZNear, frustumZNear, frustumZFar);
+		}
+		glMatrixMode(GL_MODELVIEW);
+		glLoadIdentity();
+		gluLookAt(m_cameraPosition[0], m_cameraPosition[1], m_cameraPosition[2], 
+			m_cameraTargetPosition[0], m_cameraTargetPosition[1], m_cameraTargetPosition[2], 
+			cameraUp.getX(),cameraUp.getY(),cameraUp.getZ());
+	}
+
+}
+
+
+///Draws one frame: re-applies the fixed-function state (myinit), updates the camera matrices,
+///draws the coordinate axes and then issues one instanced draw call per registered shape.
+///Vertex layout read from cube_vbo: 9 floats per vertex (4 position, 3 normal, 2 uv); the per-instance
+///attributes follow from byte offset SHAPE_BUFFER_SIZE, with section sizes based on the total instance count.
+///Asserts that no GL error is pending when done.
+void GLInstancingRenderer::RenderScene(void)
+{
+	 BT_PROFILE("GlutDisplayFunc");
+
+	myinit();
+
+	updateCamera();
+
+	//render coordinate system
+	glBegin(GL_LINES);
+	glColor3f(1,0,0);
+	glVertex3f(0,0,0);
+	glVertex3f(1,0,0);
+	glColor3f(0,1,0);
+	glVertex3f(0,0,0);
+	glVertex3f(0,1,0);
+	glColor3f(0,0,1);
+	glVertex3f(0,0,0);
+	glVertex3f(0,0,1);
+	glEnd();
+
+	//do a finish, to make sure timings are clean
+	//	glFinish();
+
+
+
+	//	glBindBuffer(GL_ARRAY_BUFFER, 0);
+	glBindBuffer(GL_ARRAY_BUFFER, cube_vbo);
+	glFlush();
+
+	//updatePos();
+
+//	simulationLoop();
+
+	//useCPU = true;
+
+	//the per-instance sections of the buffer are sized by the total instance count
+	int totalNumInstances = 0;
+
+	for (int i=0;i<m_graphicsInstances.size();i++)
+	{
+		totalNumInstances+=m_graphicsInstances[i]->m_numGraphicsInstances;
+	}
+
+	//running instance count of the shapes drawn so far; used as the first instance slot of the current shape
+	int curOffset = 0;
+
+	for (int i=0;i<m_graphicsInstances.size();i++)
+	{
+		
+		btGraphicsInstance* gfxObj = m_graphicsInstances[i];
+		//myOffset is computed but not used below
+		int myOffset = gfxObj->m_instanceOffset*4*sizeof(float);
+
+		int POSITION_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
+		int ORIENTATION_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
+		int COLOR_BUFFER_SIZE = (totalNumInstances*sizeof(float)*4);
+		//SCALE_BUFFER_SIZE is computed but not used below
+		int SCALE_BUFFER_SIZE = (totalNumInstances*sizeof(float)*3);
+
+		glBindVertexArray(gfxObj->m_cube_vao);
+
+		
+		int vertexStride = 9*sizeof(float);
+		int vertexBase = gfxObj->m_vertexArrayOffset*vertexStride;
+
+		//attribute 0: vertex position (4 floats at the start of each 9-float record)
+		glVertexAttribPointer(0, 4, GL_FLOAT, GL_FALSE, 9*sizeof(float), (GLvoid*)vertexBase);
+		//attributes 1 and 2: per-instance position and orientation, tightly packed (stride 0)
+		glVertexAttribPointer(1, 4, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*4*sizeof(float)+SHAPE_BUFFER_SIZE));
+		glVertexAttribPointer(2, 4, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*4*sizeof(float)+SHAPE_BUFFER_SIZE+POSITION_BUFFER_SIZE));
+		//uv starts 7 floats into a vertex record, the normal 4 floats in
+		int uvoffset = 7*sizeof(float)+vertexBase;
+		int normaloffset = 4*sizeof(float)+vertexBase;
+
+		glVertexAttribPointer(3, 2, GL_FLOAT, GL_FALSE, 9*sizeof(float), (GLvoid *)uvoffset);
+		glVertexAttribPointer(4, 3, GL_FLOAT, GL_FALSE, 9*sizeof(float), (GLvoid *)normaloffset);
+		//attributes 5 and 6: per-instance color (4 floats) and scale (3 floats)
+		glVertexAttribPointer(5, 4, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*4*sizeof(float)+SHAPE_BUFFER_SIZE+POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE));
+		glVertexAttribPointer(6, 3, GL_FLOAT, GL_FALSE, 0, (GLvoid *)(curOffset*3*sizeof(float)+SHAPE_BUFFER_SIZE+POSITION_BUFFER_SIZE+ORIENTATION_BUFFER_SIZE+COLOR_BUFFER_SIZE));
+
+		glEnableVertexAttribArray(0);
+		glEnableVertexAttribArray(1);
+		glEnableVertexAttribArray(2);
+		glEnableVertexAttribArray(3);
+		glEnableVertexAttribArray(4);
+		glEnableVertexAttribArray(5);
+		glEnableVertexAttribArray(6);
+
+		//divisor 0 = advance per vertex, divisor 1 = advance once per instance
+		glVertexAttribDivisor(0, 0);
+		glVertexAttribDivisor(1, 1);
+		glVertexAttribDivisor(2, 1);
+		glVertexAttribDivisor(3, 0);
+		glVertexAttribDivisor(4, 0);
+		glVertexAttribDivisor(5, 1);
+		glVertexAttribDivisor(6, 1);
+	
+		glUseProgram(instancingShader);
+		glUniform1f(angle_loc, 0);
+		//forward the fixed-function matrices set up by updateCamera() to the shader
+		GLfloat pm[16];
+		glGetFloatv(GL_PROJECTION_MATRIX, pm);
+		glUniformMatrix4fv(ProjectionMatrix, 1, false, &pm[0]);
+
+		GLfloat mvm[16];
+		glGetFloatv(GL_MODELVIEW_MATRIX, mvm);
+		glUniformMatrix4fv(ModelViewMatrix, 1, false, &mvm[0]);
+
+		//the Diffuse sampler reads texture unit 0
+		glUniform1i(uniform_texture_diffuse, 0);
+
+		glFlush();
+
+		//shapes without instances are skipped
+		if (gfxObj->m_numGraphicsInstances)
+		{
+			int indexCount = gfxObj->m_numIndices;
+			int indexOffset = 0;
+
+			glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, gfxObj->m_index_vbo);
+			{
+				BT_PROFILE("glDrawElementsInstanced");
+				glDrawElementsInstanced(GL_TRIANGLES, indexCount, GL_UNSIGNED_INT, (void*)indexOffset, gfxObj->m_numGraphicsInstances);
+			}
+		}
+		curOffset+= gfxObj->m_numGraphicsInstances;
+	}
+	glUseProgram(0);
+	glBindBuffer(GL_ARRAY_BUFFER,0);
+	glBindVertexArray(0);
+
+	
+	GLint err = glGetError();
+	assert(err==GL_NO_ERROR);
+}
+
+
+///Releases the CPU staging arrays allocated by the constructor.
+///Each pointer is reset after deletion, so calling this more than once is harmless
+///(previously a second call deleted the same arrays again).
+///Must be called while the renderer is still alive, i.e. before its destructor deletes m_data.
+void GLInstancingRenderer::CleanupShaders()
+{
+	delete []m_data->m_instance_positions_ptr;
+	m_data->m_instance_positions_ptr = 0;
+
+	delete []m_data->m_instance_quaternion_ptr;
+	m_data->m_instance_quaternion_ptr = 0;
+
+	delete []m_data->m_instance_colors_ptr;
+	m_data->m_instance_colors_ptr = 0;
+
+	delete []m_data->m_instance_scale_ptr;
+	m_data->m_instance_scale_ptr = 0;
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GLInstancingRenderer.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GLInstancingRenderer.h
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef GL_INSTANCING_RENDERER_H
+#define GL_INSTANCING_RENDERER_H
+
+#include "LinearMath/btAlignedObjectArray.h"
+
+///Instanced OpenGL renderer: shapes are registered once, then any number of instances
+///(position, orientation, color, scale) of each shape are drawn with one instanced call per shape.
+class GLInstancingRenderer
+{
+	
+	///one entry per registered shape, indexed by the value returned from registerShape
+	btAlignedObjectArray<struct btGraphicsInstance*> m_graphicsInstances;
+
+	///CPU-side staging arrays for the per-instance attributes
+	struct InternalDataRenderer* m_data;
+
+public:
+	GLInstancingRenderer();
+	virtual ~GLInstancingRenderer();
+
+	///builds the shader program and allocates the shared vertex buffer
+	void InitShaders();
+	void RenderScene(void);
+	///releases the CPU staging arrays; the destructor does not free them
+	void CleanupShaders();
+
+	///vertices must be 9 floats per vertex in the format x,y,z,w, nx,ny,nz, u,v
+	///indices are 32-bit and drawn as triangles; returns the index of the new shape
+	int registerShape(const float* vertices, int numvertices, const int* indices, int numIndices);
+
+	///position x,y,z, quaternion x,y,z,w, color r,g,b,a, scaling x,y,z
+	///returns the number of instances registered for the shape so far (a count, not an instance index)
+	int registerGraphicsInstance(int shapeIndex, const float* position, const float* quaternion, const float* color, const float* scaling);
+
+	///copies the staged per-instance data into the vertex buffer
+	void writeTransforms();
+};
+
+#endif //GL_INSTANCING_RENDERER_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GlutRenderer.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GlutRenderer.cpp
@@ -0,0 +1,107 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+#include <GL/glew.h>
+#include "GlutRenderer.h"
+#include <stdio.h>
+
+
+///The single renderer instance that the static GLUT callbacks forward to; set by the GlutRenderer constructor.
+GlutRenderer* GlutRenderer::gDemoApplication = 0;
+
+
+
+///Hands control to GLUT's event loop by calling glutMainLoop().
+void GlutRenderer::runMainLoop()
+{
+	glutMainLoop();
+
+}
+
+
+//Free-function trampolines registered with GLUT in GlutRenderer::initGraphics.
+//Each one forwards the event to the instance stored in GlutRenderer::gDemoApplication,
+//except the idle callback, which only requests a redraw.
+static void glutKeyboardCallback(unsigned char key, int x, int y)
+{
+	GlutRenderer::gDemoApplication->keyboardCallback(key, x, y);
+}
+
+static void glutKeyboardUpCallback(unsigned char key, int x, int y)
+{
+	GlutRenderer::gDemoApplication->keyboardUpCallback(key, x, y);
+}
+
+static void glutSpecialKeyboardCallback(int key, int x, int y)
+{
+	GlutRenderer::gDemoApplication->specialKeyboard(key, x, y);
+}
+
+static void glutSpecialKeyboardUpCallback(int key, int x, int y)
+{
+	GlutRenderer::gDemoApplication->specialKeyboardUp(key, x, y);
+}
+
+static void glutReshapeCallback(int w, int h)
+{
+	GlutRenderer::gDemoApplication->resize(w, h);
+}
+
+static void glutIdleCallback()
+{
+	glutPostRedisplay();
+}
+
+static void glutMouseFuncCallback(int button, int state, int x, int y)
+{
+	GlutRenderer::gDemoApplication->mouseFunc(button, state, x, y);
+}
+
+static void glutMotionFuncCallback(int x, int y)
+{
+	GlutRenderer::gDemoApplication->mouseMotionFunc(x, y);
+}
+
+static void glutDisplayCallback(void)
+{
+	GlutRenderer::gDemoApplication->displayCallback();
+}
+
+
+///Reshape handler: records the new window size and resizes the GL viewport to match.
+///@param width  new window width in pixels, as reported by GLUT
+///@param height new window height in pixels, as reported by GLUT
+void GlutRenderer::resize(int width, int height)
+{
+	m_glutScreenWidth = width;
+	m_glutScreenHeight = height;
+
+	//Registering a reshape callback replaces GLUT's default one, which is what
+	//normally calls glViewport; without this call the viewport keeps its initial size.
+	glViewport(0, 0, width, height);
+}
+
+///Mouse button handler; the base implementation does nothing (virtual, see GlutRenderer.h).
+void GlutRenderer::mouseFunc(int button, int state, int x, int y)
+{
+}
+///Mouse motion handler (registered for both passive and button-held motion in initGraphics);
+///the base implementation does nothing.
+void	GlutRenderer::mouseMotionFunc(int x,int y)
+{
+}
+
+///Default frame: clears the color and depth buffers, swaps the buffers and requests
+///another redraw. In debug builds it asserts that no OpenGL error is pending.
+void GlutRenderer::renderScene()
+{
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
+	glutSwapBuffers();
+	glutPostRedisplay();
+
+	//glGetError returns GLenum; the original stored it in a signed GLint
+	GLenum err = glGetError();
+	assert(err==GL_NO_ERROR);
+	(void)err;//avoids an unused-variable warning when assert is compiled out
+}
+
+///Per-frame entry point invoked from the GLUT display callback:
+///runs updateScene() first, then renderScene().
+void GlutRenderer::displayCallback()
+{
+	updateScene();
+	
+	renderScene();
+}
+
+///Initializes GLUT from the command line and registers this object as the target
+///of the static GLUT callbacks.
+///All data members now get defined starting values; this constructor previously left the
+///screen size and camera fields uninitialized. The values are neutral defaults
+///(zero size, camera at the origin, +Y up) that callers are free to overwrite.
+///@param argc argument count, passed on to glutInit
+///@param argv argument vector, passed on to glutInit
+GlutRenderer::GlutRenderer(int argc, char* argv[])
+:m_glutScreenWidth(0),
+m_glutScreenHeight(0),
+m_cameraPosition(btScalar(0.),btScalar(0.),btScalar(0.)),
+m_cameraTargetPosition(btScalar(0.),btScalar(0.),btScalar(0.)),
+m_cameraDistance(btScalar(0.)),
+m_cameraUp(btScalar(0.),btScalar(1.),btScalar(0.)),
+m_azimuth(0.f),
+m_elevation(0.f)
+{
+	glutInit(&argc, argv);
+	gDemoApplication = this;
+}
+
+///Creates the GLUT window, registers all GLUT callbacks and initializes GLEW.
+///@param width  requested window width in pixels
+///@param height requested window height in pixels
+///A GLEW initialization failure is only reported on stdout; execution continues.
+void GlutRenderer::initGraphics(int width, int height)
+{
+	m_glutScreenWidth = width;
+	m_glutScreenHeight = height;
+
+	//GLUT_DEPTH added: renderScene() clears GL_DEPTH_BUFFER_BIT, so the window needs a
+	//depth buffer, which GLUT only allocates when it is requested here.
+	glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA | GLUT_DEPTH);
+
+	glutInitWindowSize(m_glutScreenWidth, m_glutScreenHeight);
+	glutCreateWindow("GPU rigid body pipeline2");
+
+	//keyboard
+	glutKeyboardFunc(glutKeyboardCallback);
+	glutKeyboardUpFunc(glutKeyboardUpCallback);
+	glutSpecialFunc(glutSpecialKeyboardCallback);
+	glutSpecialUpFunc(glutSpecialKeyboardUpCallback);
+
+	//window and frame
+	glutReshapeFunc(glutReshapeCallback);
+	glutIdleFunc(glutIdleCallback);
+
+	//mouse: the same handler receives motion with and without a button held
+	glutMouseFunc(glutMouseFuncCallback);
+	glutPassiveMotionFunc(glutMotionFuncCallback);
+	glutMotionFunc(glutMotionFuncCallback);
+	glutDisplayFunc( glutDisplayCallback );
+
+	//glewInit is called after glutCreateWindow, once the window and its context exist
+	GLenum err = glewInit();
+	if (GLEW_OK != err)
+	{
+		printf("Error: %s\n", glewGetErrorString(err));
+	}
+
+	glClearColor(0.6f,0.6f,1.f,1.f);
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GlutRenderer.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/GlutRenderer.h
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+#ifndef GLUT_RENDERER_H
+#define GLUT_RENDERER_H
+
+#include "btGlutInclude.h"
+#include "LinearMath/btVector3.h"
+
+///Minimal GLUT application base: owns the window size and camera parameters and
+///exposes the GLUT events as virtual methods that derived renderers override.
+struct GlutRenderer
+{
+	///instance the static GLUT callbacks forward to (set by the constructor)
+	static GlutRenderer* gDemoApplication;
+	///current window size in pixels
+	int m_glutScreenWidth;
+	int m_glutScreenHeight;
+
+	///camera parameters; not used by the base class methods in GlutRenderer.cpp
+	btVector3 m_cameraPosition;
+	btVector3 m_cameraTargetPosition;
+	btScalar m_cameraDistance;
+	btVector3 m_cameraUp;
+	float m_azimuth;
+	float m_elevation;
+
+
+	GlutRenderer(int argc, char* argv[]);
+
+	///virtual destructor: the type has virtual methods and is meant to be derived from,
+	///so deleting a derived object through a GlutRenderer* must be well-defined
+	virtual ~GlutRenderer() {}
+	
+	virtual void initGraphics(int width, int height);
+	virtual void cleanup() {}
+	
+	void runMainLoop();
+
+	///called once per frame before renderScene(); default does nothing
+	virtual void updateScene(){}
+	
+	virtual void renderScene();
+
+	///input and window events; the inline defaults do nothing
+	virtual void	keyboardCallback(unsigned char key, int x, int y) {}
+	virtual void	keyboardUpCallback(unsigned char key, int x, int y) {}
+	virtual void	specialKeyboard(int key, int x, int y){}
+	virtual void	specialKeyboardUp(int key, int x, int y){}
+	virtual void	resize(int w, int h);
+	virtual void	mouseFunc(int button, int state, int x, int y);
+	virtual void	mouseMotionFunc(int x,int y);
+	virtual void displayCallback();
+	
+
+};
+
+#endif //GLUT_RENDERER_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/NVIDIA/premake4.lua
@@ -0,0 +1,64 @@
+	
+	-- premake4 project for the gpu_rigidbody_pipeline2 demo built against the NVIDIA OpenCL SDK.
+	-- findOpenCL_NVIDIA / initOpenCL_NVIDIA / initOpenGL / initGlew are helpers defined outside
+	-- this file; presumably they locate the SDK and add its include/lib settings (verify there).
+	hasCL = findOpenCL_NVIDIA()
+	
+	-- the project is only generated when the helper above reports success
+	if (hasCL) then
+
+		project "OpenCL_gpu_rigidbody_pipeline2_NVIDIA"
+
+		initOpenCL_NVIDIA()
+	
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+
+		initOpenGL()
+		initGlew()
+
+
+			includedirs {
+		"../../primitives",
+		"../../../bullet2"
+		}
+		
+		-- demo sources plus the broadphase, narrowphase/solver, Bullet 2 and OpenCL utility
+		-- sources from sibling directories, compiled directly into this executable
+		files {
+			"../main.cpp",
+			"../CLPhysicsDemo.cpp",
+			"../CLPhysicsDemo.h",
+			"../GLInstancingRenderer.cpp",
+			"../GLInstancingRenderer.h",
+			"../GlutRenderer.cpp",
+			"../GlutRenderer.h",
+			"../Win32OpenGLRenderManager.cpp",
+			"../Win32OpenGLRenderManager.h",	
+			"../../gpu_rigidbody_pipeline/btConvexUtility.cpp",
+			"../../gpu_rigidbody_pipeline/btConvexUtility.h",
+			"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.cpp",
+			"../../gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.cpp",
+			"../../../dynamics/basic_demo/ConvexHeightFieldShape.h",
+			"../../../bullet2/LinearMath/btConvexHullComputer.cpp",
+			"../../../bullet2/LinearMath/btConvexHullComputer.h",
+			"../../broadphase_benchmark/findPairsOpenCL.cpp",
+			"../../broadphase_benchmark/findPairsOpenCL.h",
+			"../../broadphase_benchmark/btGridBroadphaseCL.cpp",
+			"../../broadphase_benchmark/btGridBroadphaseCL.h",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp",
+			"../../3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp",
+			"../../3dGridBroadphase/Shared/btGpu3DGridBroadphase.h",
+			"../../../bullet2/LinearMath/btAlignedAllocator.cpp",
+			"../../../bullet2/LinearMath/btQuickprof.cpp",
+			"../../../bullet2/LinearMath/btQuickprof.h",
+			"../../../bullet2/BulletCollision/BroadphaseCollision/btBroadphaseProxy.cpp",
+			"../../../bullet2/BulletCollision/BroadphaseCollision/btOverlappingPairCache.cpp",
+			"../../../bullet2/BulletCollision/BroadphaseCollision/btSimpleBroadphase.cpp",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+			"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+			"../../opengl_interop/btStopwatch.cpp",
+			"../../opengl_interop/btStopwatch.h"
+		}
+		
+	end
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/OpenGLInclude.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/OpenGLInclude.h
@@ -0,0 +1,41 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+#ifndef __OPENGL_INCLUDE_H
+#define __OPENGL_INCLUDE_H
+
+#include <GL/glew.h>
+
+//think different
+#if defined(__APPLE__) && !defined (VMDMESA)
+#include <OpenGL/OpenGL.h>
+#include <OpenGL/gl.h>
+#include <OpenGL/glu.h>
+#else
+
+
+#ifdef _WINDOWS
+#include <windows.h>
+#include <GL/gl.h>
+#include <GL/glu.h>
+#else
+#include <GL/gl.h>
+
+#endif //_WINDOWS
+#endif //APPLE
+
+#endif //__OPENGL_INCLUDE_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/ShapeData.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/ShapeData.h
@@ -0,0 +1,210 @@
+#ifndef SHAPE_DATA_H
+#define SHAPE_DATA_H
+
+///Barrel mesh vertices, 9 floats per row (57 rows): position x,y,z, a fourth float that is
+///1.0f in every row (the cube tables below label this slot "unused w"), normal nx,ny,nz, then u,v
+///(0.5f,0.5f everywhere).
+///Rows 0-11 lie at y=-0.5 with normal (0,-1,0), rows 12-23 at y=0.5 with normal (0,1,0);
+///rows 24-56 carry outward-leaning normals and form the side wall, which bulges to radius 0.5 at y=0.
+static float barrel_vertices[] = {
+0.0f,-0.5f,0.0f,				      1.0f,  0.0f,-1.0f,0.0f,                           				0.5f,	0.5f,
+0.282362f,-0.5f,-0.205148f,   1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+0.349018f,-0.5f,0.0f,         1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+0.107853f,-0.5f,-0.331936f,   1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+-0.107853f,-0.5f,-0.331936f,  1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+0.107853f,-0.5f,-0.331936f,   1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+-0.282362f,-0.5f,-0.205148f,  1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+-0.349018f,-0.5f,0.0f,        1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+-0.282362f,-0.5f,0.205148f,   1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+-0.107853f,-0.5f,0.331936f,   1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+0.107853f,-0.5f,0.331936f,    1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+0.282362f,-0.5f,0.205148f,    1.0f,  0.0f,-1.0f,0.0f,                                   0.5f, 0.5f,
+0.0f,0.5f,0.0f,               1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.349018f,0.5f,0.0f,          1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.282362f,0.5f,-0.205148f,    1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.107853f,0.5f,-0.331936f,    1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.107853f,0.5f,-0.331936f,    1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+-0.107853f,0.5f,-0.331936f,   1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+-0.282362f,0.5f,-0.205148f,   1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+-0.349018f,0.5f,0.0f,         1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+-0.282362f,0.5f,0.205148f,    1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+-0.107853f,0.5f,0.331936f,    1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.107853f,0.5f,0.331936f,     1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.282362f,0.5f,0.205148f,     1.0f,  0.0f,1.0f,0.0f,                                    0.5f, 0.5f,
+0.349018f,-0.5f,0.0f,         1.0f,  0.957307f,-0.289072f,0.0f,                         0.5f, 0.5f,
+0.404509f,0.0f,-0.293893f,    1.0f,  0.809017f,0.0f,-0.587785f,                         0.5f, 0.5f,
+0.5f,0.0f,0.0f,               1.0f,  1.0f,0.0f,0.0f,                                    0.5f, 0.5f,
+0.282362f,-0.5f,-0.205148f,   1.0f,  0.774478f,-0.289072f,-0.562691f,                   0.5f, 0.5f,
+0.154508f,0.0f,-0.475528f,    1.0f,  0.309017f,0.0f,-0.951057f,                         0.5f, 0.5f,
+0.107853f,-0.5f,-0.331936f,   1.0f,  0.295824f,-0.289072f,-0.910453f,                   0.5f, 0.5f,
+0.107853f,-0.5f,-0.331936f,   1.0f,  0.295824f,-0.289072f,-0.910453f,                   0.5f, 0.5f,
+-0.154509f,0.0f,-0.475528f,   1.0f,  -0.309017f,0.0f,-0.951057f,                        0.5f, 0.5f,
+0.154508f,0.0f,-0.475528f,    1.0f,  0.309017f,0.0f,-0.951057f,                         0.5f, 0.5f,
+-0.107853f,-0.5f,-0.331936f,  1.0f,  -0.295824f,-0.289072f,-0.910453f,                  0.5f, 0.5f,
+-0.404509f,0.0f,-0.293893f,   1.0f,  -0.809017f,0.0f,-0.587785f,                        0.5f, 0.5f,
+-0.282362f,-0.5f,-0.205148f,  1.0f,  -0.774478f,-0.289072f,-0.562691f,                  0.5f, 0.5f,
+-0.5f,0.0f,0.0f,              1.0f,  -1.0f,0.0f,0.0f,                                   0.5f, 0.5f,
+-0.349018f,-0.5f,0.0f,        1.0f,  -0.957307f,-0.289072f,0.0f,                        0.5f, 0.5f,
+-0.404508f,0.0f,0.293893f,    1.0f,  -0.809017f,0.0f,0.587785f,                         0.5f, 0.5f,
+-0.282362f,-0.5f,0.205148f,   1.0f,  -0.774478f,-0.289072f,0.562691f,                   0.5f, 0.5f,
+-0.154509f,0.0f,0.475528f,    1.0f,  -0.309017f,0.0f,0.951056f,                         0.5f, 0.5f,
+-0.107853f,-0.5f,0.331936f,   1.0f,  -0.295824f,-0.289072f,0.910453f,                   0.5f, 0.5f,
+0.154509f,0.0f,0.475528f,     1.0f,  0.309017f,0.0f,0.951056f,                          0.5f, 0.5f,
+0.107853f,-0.5f,0.331936f,    1.0f,  0.295824f,-0.289072f,0.910453f,                    0.5f, 0.5f,
+0.404509f,0.0f,0.293892f,     1.0f,  0.809017f,0.0f,0.587785f,                          0.5f, 0.5f,
+0.282362f,-0.5f,0.205148f,    1.0f,  0.774478f,-0.289072f,0.562691f,                    0.5f, 0.5f,
+0.282362f,0.5f,-0.205148f,    1.0f,  0.774478f,0.289072f,-0.562691f,                    0.5f, 0.5f,
+0.349018f,0.5f,0.0f,          1.0f,  0.957307f,0.289072f,0.0f,                          0.5f, 0.5f,
+0.107853f,0.5f,-0.331936f,    1.0f,  0.295824f,0.289072f,-0.910453f,                    0.5f, 0.5f,
+-0.107853f,0.5f,-0.331936f,   1.0f,  -0.295824f,0.289072f,-0.910453f,                   0.5f, 0.5f,
+0.107853f,0.5f,-0.331936f,    1.0f,  0.295824f,0.289072f,-0.910453f,                    0.5f, 0.5f,
+-0.282362f,0.5f,-0.205148f,   1.0f,  -0.774478f,0.289072f,-0.562691f,                   0.5f, 0.5f,
+-0.349018f,0.5f,0.0f,         1.0f,  -0.957307f,0.289072f,0.0f,                         0.5f, 0.5f,
+-0.282362f,0.5f,0.205148f,    1.0f,  -0.774478f,0.289072f,0.562691f,                    0.5f, 0.5f,
+-0.107853f,0.5f,0.331936f,    1.0f,  -0.295824f,0.289072f,0.910453f,                    0.5f, 0.5f,
+0.107853f,0.5f,0.331936f,     1.0f,  0.295824f,0.289072f,0.910453f,                     0.5f, 0.5f,
+0.282362f,0.5f,0.205148f,     1.0f,  0.774478f,0.289072f,0.562691f,                     0.5f, 0.5f
+};                            
+       
+
+
+///Triangle list for barrel_vertices: 60 triangles, three vertex indices each (values 0..56).
+///The first 10 triangles fan out from vertex 0 (the y=-0.5 cap), the next 10 from vertex 12
+///(the y=0.5 cap); the remaining 40 use vertices 24-56 (the side wall).
+static int barrel_indices[] = {
+0,1,2,
+0,3,1,
+0,4,5,
+0,6,4,
+0,7,6,
+0,8,7,
+0,9,8,
+0,10,9,
+0,11,10,
+0,2,11,
+12,13,14,
+12,14,15,
+12,16,17,
+12,17,18,
+12,18,19,
+12,19,20,
+12,20,21,
+12,21,22,
+12,22,23,
+12,23,13,
+24,25,26,
+24,27,25,
+27,28,25,
+27,29,28,
+30,31,32,
+30,33,31,
+33,34,31,
+33,35,34,
+35,36,34,
+35,37,36,
+37,38,36,
+37,39,38,
+39,40,38,
+39,41,40,
+41,42,40,
+41,43,42,
+43,44,42,
+43,45,44,
+45,26,44,
+45,24,26,
+26,46,47,
+26,25,46,
+25,48,46,
+25,28,48,
+32,49,50,
+32,31,49,
+31,51,49,
+31,34,51,
+34,52,51,
+34,36,52,
+36,53,52,
+36,38,53,
+38,54,53,
+38,40,54,
+40,55,54,
+40,42,55,
+42,56,55,
+42,44,56,
+44,47,56,
+44,26,47,
+};
+
+
+///Unit cube (extents -0.5..0.5 on every axis), 24 vertices = 4 per face, 9 floats per vertex:
+///position xyz, unused w, normal, uv
+///The unused w slot is 0.0f for the first face and 0.5f for all other faces.
+///Faces appear in the order +Z, -Z, -X, +X, -Y, +Y (see the normal column).
+static const float cube_vertices[] =
+{
+	-0.5f, -0.5f, 0.5f, 0.0f,	0,0,1,	0,0,//0
+	0.5f, -0.5f, 0.5f, 0.0f,	0,0,1,	1,0,//1
+	0.5f,  0.5f, 0.5f, 0.0f,	0,0,1,	1,1,//2
+	-0.5f,  0.5f, 0.5f, 0.0f,	0,0,1,	0,1	,//3
+
+	-0.5f, -0.5f, -0.5f, 0.5f,	0,0,-1,	0,0,//4
+	0.5f, -0.5f, -0.5f, 0.5f,	0,0,-1,	1,0,//5
+	0.5f,  0.5f, -0.5f, 0.5f,	0,0,-1,	1,1,//6
+	-0.5f,  0.5f, -0.5f, 0.5f,	0,0,-1,	0,1,//7
+
+	-0.5f, -0.5f, -0.5f, 0.5f,	-1,0,0,	0,0,
+	-0.5f, 0.5f, -0.5f, 0.5f,	-1,0,0,	1,0,
+	-0.5f,  0.5f, 0.5f, 0.5f,	-1,0,0,	1,1,
+	-0.5f,  -0.5f, 0.5f, 0.5f,	-1,0,0,	0,1,
+
+	0.5f, -0.5f, -0.5f, 0.5f,	1,0,0,	0,0,
+	0.5f, 0.5f, -0.5f, 0.5f,	1,0,0,	1,0,
+	0.5f,  0.5f, 0.5f, 0.5f,	1,0,0,	1,1,
+	0.5f,  -0.5f, 0.5f, 0.5f,	1,0,0,	0,1,
+
+	-0.5f, -0.5f,  -0.5f, 0.5f,	0,-1,0,	0,0,
+	-0.5f, -0.5f, 0.5f, 0.5f,	0,-1,0,	1,0,
+	0.5f, -0.5f,  0.5f, 0.5f,	0,-1,0,	1,1,
+	0.5f,-0.5f,  -0.5f,  0.5f,	0,-1,0,	0,1,
+
+	-0.5f, 0.5f,  -0.5f, 0.5f,	0,1,0,	0,0,
+	-0.5f, 0.5f, 0.5f, 0.5f,	0,1,0,	1,0,
+	0.5f, 0.5f,  0.5f, 0.5f,	0,1,0,	1,1,
+	0.5f,0.5f,  -0.5f,  0.5f,	0,1,0,	0,1,
+};
+
+
+///Box with the same layout and face order as cube_vertices, but stretched to -1.5..1.5 along x
+///(y and z stay at -0.5..0.5). 24 vertices, 9 floats each:
+///position xyz, unused w, normal, uv
+static const float cube_vertices2[] =
+{
+	-1.5f,	-0.5f,	0.5f, 0.0f,	0,0,1,	0,0,//0
+	1.5f,	-0.5f,	 0.5f, 0.0f,	0,0,1,	1,0,//1
+	1.5f,	0.5f,			0.5f, 0.0f,	0,0,1,	1,1,//2
+	-1.5f,  0.5f,		0.5f, 0.0f,	0,0,1,	0,1	,//3
+
+	-1.5f,	-0.5f, -0.5f, 0.5f,	0,0,-1,	0,0,//4
+	1.5f,	-0.5f, 		-0.5f, 0.5f,	0,0,-1,	1,0,//5
+	1.5f,	0.5f,			-0.5f, 0.5f,	0,0,-1,	1,1,//6
+	-1.5f,	0.5f,		-0.5f, 0.5f,	0,0,-1,	0,1,//7
+
+	-1.5f,	-0.5f, -0.5f, 0.5f,	-1,0,0,	0,0,
+	-1.5f,	0.5f,		-0.5f, 0.5f,	-1,0,0,	1,0,
+	-1.5f,  0.5f,		0.5f, 0.5f,	-1,0,0,	1,1,
+	-1.5f,  -0.5f,	0.5f, 0.5f,	-1,0,0,	0,1,
+
+	1.5f,	-0.5f, 		-0.5f, 0.5f,	1,0,0,	0,0,
+	1.5f,	0.5f,			-0.5f, 0.5f,	1,0,0,	1,0,
+	1.5f,	0.5f,			0.5f, 0.5f,	1,0,0,	1,1,
+	1.5f,  -0.5f,		0.5f, 0.5f,	1,0,0,	0,1,
+
+	-1.5f, -0.5f,  -0.5f, 0.5f,	0,-1,0,	0,0,
+	-1.5f, -0.5f,		0.5f, 0.5f,	0,-1,0,	1,0,
+	1.5f,	-0.5f,  	0.5f, 0.5f,	0,-1,0,	1,1,
+	1.5f,	-0.5f,  	-0.5f,  0.5f,	0,-1,0,	0,1,
+
+	-1.5f,	0.5f,  -0.5f, 0.5f,	0,1,0,	0,0,
+	-1.5f,	0.5f,		0.5f, 0.5f,	0,1,0,	1,0,
+	1.5f,	0.5f,  		0.5f, 0.5f,	0,1,0,	1,1,
+	1.5f,	0.5f,  	-0.5f,  0.5f,	0,1,0,	0,1,
+};
+
+
+///Triangle list shared by cube_vertices and cube_vertices2: two triangles per face,
+///each face using its own four consecutive vertices (36 indices in total).
+static const int cube_indices[]=
+{
+	0,1,2,0,2,3,//"ground face" in the original comment; vertices 0-3 carry normal (0,0,1)
+	4,5,6,4,6,7,//"top face" in the original comment; vertices 4-7 carry normal (0,0,-1)
+	8,9,10,8,10,11,//normal (-1,0,0)
+	12,13,14,12,14,15,//normal (1,0,0)
+	16,17,18,16,18,19,//normal (0,-1,0)
+	20,21,22,20,22,23//normal (0,1,0)
+};
+
+#endif //SHAPE_DATA_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/Win32OpenGLRenderManager.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/Win32OpenGLRenderManager.cpp
@@ -0,0 +1,465 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+#include "Win32OpenGLRenderManager.h"
+
+#include <windows.h>
+#include <GL/gl.h>
+
+//File-local pointer to the window data that WndProc reads and updates on WM_SIZE.
+//Assigned in the Win32OpenGLWindow constructor and reset to 0 in its destructor.
+static InternalData2* sData = 0;
+
+///Win32 / WGL state owned by a Win32OpenGLWindow. Every field starts out as zero or false.
+struct InternalData2
+{
+	HWND	m_hWnd;					///window handle
+	int		m_width;				///current width in pixels
+	int		m_height;				///current height in pixels
+	HDC		m_hDC;					///device context obtained from m_hWnd
+	HGLRC	m_hRC;					///OpenGL rendering context
+	bool	m_OpenGLInitialized;	///set by enableOpenGL, cleared by disableOpenGL
+	int		m_oldScreenWidth;		///display mode saved before going fullscreen
+	int		m_oldHeight;
+	int		m_oldBitsPerPel;
+	bool	m_quit;					///set once WM_QUIT has been received
+
+	InternalData2()
+		:m_hWnd(0),
+		m_width(0),
+		m_height(0),
+		m_hDC(0),
+		m_hRC(0),
+		m_OpenGLInitialized(false),
+		m_oldScreenWidth(0),
+		m_oldHeight(0),
+		m_oldBitsPerPel(0),
+		m_quit(false)
+	{
+	}
+};
+
+
+///Acquires the window's device context, selects a double-buffered RGBA pixel format
+///(24 color bits, 16 depth bits, 1 stencil bit) and creates and activates a WGL context.
+///m_OpenGLInitialized is set only when every step succeeded; on failure whatever was
+///acquired is released again and the handles are reset to 0.
+void Win32OpenGLWindow::enableOpenGL()
+{
+	// get the device context (DC)
+	m_data->m_hDC = GetDC( m_data->m_hWnd );
+	if (!m_data->m_hDC)
+	{
+		return;
+	}
+
+	// set the pixel format for the DC
+	PIXELFORMATDESCRIPTOR pfd;
+	ZeroMemory( &pfd, sizeof( pfd ) );
+	pfd.nSize = sizeof( pfd );
+	pfd.nVersion = 1;
+	pfd.dwFlags = PFD_DRAW_TO_WINDOW | PFD_SUPPORT_OPENGL | PFD_DOUBLEBUFFER;
+	pfd.iPixelType = PFD_TYPE_RGBA;
+	pfd.cColorBits = 24;
+	pfd.cDepthBits = 16;
+	pfd.cStencilBits = 1;
+	pfd.iLayerType = PFD_MAIN_PLANE;
+
+	//ChoosePixelFormat returns 0 and SetPixelFormat returns FALSE on failure
+	const int format = ChoosePixelFormat( m_data->m_hDC, &pfd );
+	if (!format || !SetPixelFormat( m_data->m_hDC, format, &pfd ))
+	{
+		ReleaseDC( m_data->m_hWnd, m_data->m_hDC );
+		m_data->m_hDC = 0;
+		return;
+	}
+
+	// create and enable the render context (RC)
+	m_data->m_hRC = wglCreateContext( m_data->m_hDC );
+	if (!m_data->m_hRC || !wglMakeCurrent( m_data->m_hDC, m_data->m_hRC ))
+	{
+		if (m_data->m_hRC)
+		{
+			wglDeleteContext( m_data->m_hRC );
+			m_data->m_hRC = 0;
+		}
+		ReleaseDC( m_data->m_hWnd, m_data->m_hDC );
+		m_data->m_hDC = 0;
+		return;
+	}
+
+	m_data->m_OpenGLInitialized = true;
+}
+
+
+///Deactivates and deletes the WGL context and releases the device context.
+///The stored handles are reset to 0 afterwards so that calling this twice
+///does not delete or release stale handles.
+void Win32OpenGLWindow::disableOpenGL()
+{
+	m_data->m_OpenGLInitialized = false;
+
+	wglMakeCurrent( NULL, NULL );
+
+	if (m_data->m_hRC)
+	{
+		wglDeleteContext( m_data->m_hRC );
+		m_data->m_hRC = 0;
+	}
+
+	if (m_data->m_hDC)
+	{
+		ReleaseDC( m_data->m_hWnd, m_data->m_hDC );
+		m_data->m_hDC = 0;
+	}
+}
+
+///Drains the thread's Win32 message queue without blocking.
+///WM_QUIT sets the quit flag reported by requestedExit(); every other message is
+///translated and dispatched to the window procedure.
+void Win32OpenGLWindow::pumpMessage()
+{
+	MSG msg;
+
+	//Loop until the queue is empty. Handling only one message per call (once per frame)
+	//lets input and paint messages pile up faster than they are consumed.
+	while ( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) )
+	{
+		if ( msg.message == WM_QUIT )
+		{
+			m_data->m_quit = true;
+			break;
+		}
+
+		TranslateMessage( &msg );
+		DispatchMessage( &msg );
+	}
+}
+
+
+
+///Window procedure of the window class registered in Win32OpenGLWindow::init.
+///- WM_PAINT: validates the window (Begin/EndPaint) without drawing
+///- WM_ERASEBKGND: swallowed, the background is never erased by GDI
+///- WM_DESTROY, 'Q' or Escape key: posts WM_QUIT
+///- WM_SIZE (maximized/restored): stores the new client size and updates the GL viewport
+///Everything else goes to DefWindowProc.
+LRESULT CALLBACK WndProc(HWND hWnd, UINT message, WPARAM wParam, LPARAM lParam)
+{
+	switch (message)
+	{
+	case WM_PAINT:
+		{
+			PAINTSTRUCT ps;
+			BeginPaint(hWnd, &ps);
+			EndPaint(hWnd, &ps);
+		}
+		return 0;
+
+	case WM_ERASEBKGND:
+		return 0;
+
+	case WM_DESTROY:
+		PostQuitMessage(0);
+		return 0;
+
+	case WM_KEYDOWN:
+		if (wParam == 'Q' || wParam == VK_ESCAPE)
+		{
+			PostQuitMessage(0);
+			return 0;
+		}
+		break;
+
+	case WM_SIZE:
+		switch (wParam)
+		{
+		case SIZE_MINIMIZED:
+			return 0;
+
+		case SIZE_MAXIMIZED:
+		case SIZE_RESTORED:
+			//sData is 0 once the owning window object has been destroyed
+			if (sData)
+			{
+				sData->m_width = LOWORD (lParam);
+				sData->m_height = HIWORD (lParam);
+
+				//WM_SIZE also arrives while the window is being created, before
+				//enableOpenGL has made a context current; skip GL calls until then
+				if (sData->m_OpenGLInitialized)
+				{
+					glViewport(0, 0, sData->m_width, sData->m_height);
+				}
+			}
+			return 0;
+
+		default:
+			break;
+		}
+		break;
+
+	default:
+		break;
+	}
+
+	return DefWindowProc(hWnd, message, wParam, lParam);
+}
+
+
+
+
+///Creates (or attaches to) the window, optionally switches the display to fullscreen,
+///enables OpenGL and sets the window title.
+///@param width             requested client width in pixels
+///@param height            requested client height in pixels
+///@param fullscreen        true: borderless popup at 0,0 plus a display mode change
+///@param colorBitsPerPixel color depth for the fullscreen mode; 0 keeps the current depth
+///@param windowHandle      existing HWND to render into; 0 creates a new window
+void	Win32OpenGLWindow::init(int width,int height, bool fullscreen,int colorBitsPerPixel, void* windowHandle)
+{
+	// get handle to exe file
+	HINSTANCE hInstance = GetModuleHandle(0);
+
+	// create the window if we need to and we do not use the null device
+	if (!windowHandle)
+	{
+		const char* ClassName = "DeviceWin32";
+
+		// Register Class
+		WNDCLASSEX wcex;
+		wcex.cbSize		= sizeof(WNDCLASSEX);
+		wcex.style		= CS_HREDRAW | CS_VREDRAW;
+		wcex.lpfnWndProc	= WndProc;
+		wcex.cbClsExtra		= 0;
+		wcex.cbWndExtra		= 0;
+		wcex.hInstance		= hInstance;
+		wcex.hIcon		= LoadIcon( NULL, IDI_APPLICATION );
+		wcex.hCursor		= LoadCursor(NULL, IDC_ARROW);
+		wcex.hbrBackground	= (HBRUSH)(COLOR_WINDOW+1);
+		wcex.lpszMenuName	= 0;
+		wcex.lpszClassName	= ClassName;
+		wcex.hIconSm		= 0;
+
+		// if there is an icon file, use it; LoadImage returns NULL when the file is
+		// missing, and that must not overwrite the default application icon set above
+		HICON fileIcon = (HICON)LoadImage(hInstance, "irrlicht.ico", IMAGE_ICON, 0,0, LR_LOADFROMFILE);
+		if (fileIcon)
+		{
+			wcex.hIcon = fileIcon;
+		}
+
+		RegisterClassEx(&wcex);
+
+		// calculate the outer window size that yields the requested client size
+		RECT clientSize;
+		clientSize.top = 0;
+		clientSize.left = 0;
+		clientSize.right = width;
+		clientSize.bottom = height;
+
+		DWORD style = WS_POPUP;
+
+		if (!fullscreen)
+			style = WS_SYSMENU | WS_BORDER | WS_CAPTION | WS_CLIPCHILDREN | WS_CLIPSIBLINGS | WS_MINIMIZEBOX | WS_MAXIMIZEBOX | WS_SIZEBOX;
+
+		AdjustWindowRect(&clientSize, style, FALSE);
+
+		m_data->m_width = clientSize.right - clientSize.left;
+		m_data->m_height = clientSize.bottom - clientSize.top;
+
+		// center on the primary screen unless fullscreen
+		int windowLeft = (GetSystemMetrics(SM_CXSCREEN) - m_data->m_width) / 2;
+		int windowTop = (GetSystemMetrics(SM_CYSCREEN) - m_data->m_height) / 2;
+
+		if (fullscreen)
+		{
+			windowLeft = 0;
+			windowTop = 0;
+		}
+
+		// create window
+
+		m_data->m_hWnd = CreateWindow( ClassName, "", style, windowLeft, windowTop,
+					m_data->m_width, m_data->m_height, NULL, NULL, hInstance, NULL);
+
+		ShowWindow(m_data->m_hWnd, SW_SHOW);
+		UpdateWindow(m_data->m_hWnd);
+
+		MoveWindow(m_data->m_hWnd, windowLeft, windowTop, m_data->m_width, m_data->m_height, TRUE);
+	}
+	else
+	{
+		// attach external window
+		// NOTE(review): GetWindowRect yields the outer rectangle, while WndProc stores the
+		// client size on WM_SIZE - confirm whether GetClientRect is wanted here.
+		m_data->m_hWnd = static_cast<HWND>(windowHandle);
+		RECT r;
+		GetWindowRect(m_data->m_hWnd, &r);
+		m_data->m_width = r.right - r.left;
+		m_data->m_height = r.bottom - r.top;
+	}
+
+
+	if (fullscreen)
+	{
+		DEVMODE dm;
+		memset(&dm, 0, sizeof(dm));
+		dm.dmSize = sizeof(dm);
+		// use default values from current setting, and remember them for switchFullScreen
+		EnumDisplaySettings(NULL, ENUM_CURRENT_SETTINGS, &dm);
+		m_data->m_oldScreenWidth = dm.dmPelsWidth;
+		m_data->m_oldHeight = dm.dmPelsHeight;
+		m_data->m_oldBitsPerPel = dm.dmBitsPerPel;
+
+		dm.dmPelsWidth = width;
+		dm.dmPelsHeight = height;
+		if (colorBitsPerPixel)
+		{
+			dm.dmBitsPerPel = colorBitsPerPixel;
+		}
+		dm.dmFields = DM_BITSPERPEL | DM_PELSWIDTH | DM_PELSHEIGHT | DM_DISPLAYFREQUENCY;
+
+		LONG res = ChangeDisplaySettings(&dm, CDS_FULLSCREEN);
+		if (res != DISP_CHANGE_SUCCESSFUL)
+		{ // try again without forcing display frequency
+			dm.dmFields = DM_BITSPERPEL | DM_PELSWIDTH | DM_PELSHEIGHT;
+			res = ChangeDisplaySettings(&dm, CDS_FULLSCREEN);
+		}
+
+	}
+
+	enableOpenGL();
+
+
+	const wchar_t* text= L"OpenCL rigid body demo";
+
+#ifdef _WIN64
+		SetWindowTextW(m_data->m_hWnd, text);
+#else
+		// bounded wait, so a hung (external) window cannot block the caller for more than 2 seconds
+		DWORD dwResult;
+		SendMessageTimeoutW(m_data->m_hWnd, WM_SETTEXT, 0,
+				reinterpret_cast<LPARAM>(text),
+				SMTO_ABORTIFHUNG, 2000, &dwResult);
+#endif
+
+
+}
+
+
+///Changes the desktop display mode: enters a fullscreen mode, or re-applies the mode
+///that was saved when fullscreen was first entered.
+///@param fullscreen        true requests CDS_FULLSCREEN, false a regular mode change
+///@param width,height      fullscreen resolution; when either is 0 the stored window size is used
+///@param colorBitsPerPixel fullscreen color depth; 0 keeps the current depth
+void	Win32OpenGLWindow::switchFullScreen(bool fullscreen,int width,int height,int colorBitsPerPixel)
+{
+	//start from the display mode that is active right now
+	DEVMODE mode;
+	memset(&mode, 0, sizeof(mode));
+	mode.dmSize = sizeof(mode);
+	EnumDisplaySettings(NULL, ENUM_CURRENT_SETTINGS, &mode);
+	mode.dmFields = DM_BITSPERPEL | DM_PELSWIDTH | DM_PELSHEIGHT | DM_DISPLAYFREQUENCY;
+
+	//a non-zero saved width marks that an original mode has already been recorded
+	const bool haveSavedMode = (m_data->m_oldScreenWidth != 0);
+
+	if (fullscreen && !haveSavedMode)
+	{
+		//first switch to fullscreen: remember the current mode, then request the new one
+		m_data->m_oldScreenWidth = mode.dmPelsWidth;
+		m_data->m_oldHeight = mode.dmPelsHeight;
+		m_data->m_oldBitsPerPel = mode.dmBitsPerPel;
+
+		const bool explicitSize = (width && height);
+		mode.dmPelsWidth = explicitSize ? width : m_data->m_width;
+		mode.dmPelsHeight = explicitSize ? height : m_data->m_height;
+
+		if (colorBitsPerPixel)
+		{
+			mode.dmBitsPerPel = colorBitsPerPixel;
+		}
+	}
+	else if (haveSavedMode)
+	{
+		//otherwise fall back to the saved mode
+		mode.dmPelsWidth = m_data->m_oldScreenWidth;
+		mode.dmPelsHeight = m_data->m_oldHeight;
+		mode.dmBitsPerPel = m_data->m_oldBitsPerPel;
+	}
+
+	//the result of the mode change is not inspected
+	LONG res = ChangeDisplaySettings(&mode, fullscreen ? CDS_FULLSCREEN : 0);
+}
+
+
+
+///Allocates the window's internal data and publishes it through the file-local sData
+///pointer, which WndProc uses. No window is created until init() is called.
+Win32OpenGLWindow::Win32OpenGLWindow()
+{
+	m_data = new InternalData2();
+	sData = m_data;
+}
+
+///Frees the internal data. The pointer shared with WndProc is cleared only when it still
+///refers to this window, so destroying one window does not blank the data of another
+///window constructed later; it is cleared before the delete so it never dangles.
+Win32OpenGLWindow::~Win32OpenGLWindow()
+{
+	if (sData == m_data)
+	{
+		sData = 0;
+	}
+	delete m_data;
+	m_data = 0;
+}
+
+///Default initialization: a 640x480 non-fullscreen window; color depth and window handle
+///take the defaults declared in the header.
+void	Win32OpenGLWindow::init()
+{
+	init(640,480,false);
+}
+
+
+///Shuts the window down: the OpenGL context goes first, then the window itself.
+void	Win32OpenGLWindow::exit()
+{
+	//tear down GL while the window (and its device context) still exists
+	disableOpenGL();
+
+	DestroyWindow(m_data->m_hWnd);
+}
+
+
+
+
+
+///Begins a frame: pumps window messages, clears the color/depth/stencil buffers, enables
+///depth testing and loads a perspective projection (near 1, far 10000) whose shorter side
+///spans -1..1 and whose longer side is scaled by the aspect ratio. The modelview matrix is
+///reset to identity.
+void	Win32OpenGLWindow::startRendering()
+{
+	pumpMessage();
+
+	glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);	//clear buffers
+
+	glEnable(GL_DEPTH_TEST);
+
+	//A window dragged down to an empty client area reports a size of 0; clamp to 1 so the
+	//aspect ratio never becomes a division by zero (glFrustum rejects non-finite planes).
+	const float w = (m_data->m_width > 0) ? (float)m_data->m_width : 1.f;
+	const float h = (m_data->m_height > 0) ? (float)m_data->m_height : 1.f;
+	const bool wide = m_data->m_width > m_data->m_height;
+
+	//longer side divided by shorter side, so aspect >= 1
+	const float aspect = wide ? (w / h) : (h / w);
+
+	glMatrixMode(GL_PROJECTION);
+	glLoadIdentity();
+
+	if (wide)
+	{
+		glFrustum (-aspect, aspect, -1.0, 1.0, 1.0, 10000.0);
+	} else
+	{
+		glFrustum (-1.0, 1.0, -aspect, aspect, 1.0, 10000.0);
+	}
+
+	glMatrixMode(GL_MODELVIEW);
+	glLoadIdentity();
+
+}
+
+
+///Empty in this class; declared virtual in the header.
+void	Win32OpenGLWindow::renderAllObjects()
+{
+}
+
+///Ends a frame by swapping the front and back buffers of the window's device context.
+void	Win32OpenGLWindow::endRendering()
+{
+	SwapBuffers( m_data->m_hDC );
+}
+
+///Not implemented: always returns 0.
+float	Win32OpenGLWindow::getTimeInSeconds()
+{
+	return 0.f;
+}
+
+///Not implemented: the arguments are ignored.
+void	Win32OpenGLWindow::setDebugMessage(int x,int y,const char* message)
+{
+}
+
+///Returns true once pumpMessage() has seen WM_QUIT (posted by WndProc on WM_DESTROY, 'Q' or Escape).
+bool Win32OpenGLWindow::requestedExit()
+{
+	return m_data->m_quit;
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/Win32OpenGLRenderManager.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/Win32OpenGLRenderManager.h
@@ -0,0 +1,70 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+#ifndef _WIN32_OPENGL_RENDER_MANAGER_H
+#define _WIN32_OPENGL_RENDER_MANAGER_H
+
+
+//Declares an opaque handle type: 'name' becomes a pointer to a distinct dummy struct
+//'name__', so different handle types cannot be mixed up by accident.
+#define RM_DECLARE_HANDLE(name) typedef struct name##__ { int unused; } *name
+
+//RenderObjectHandle is declared here but not used by the Win32OpenGLWindow declaration below
+RM_DECLARE_HANDLE(RenderObjectHandle);
+
+//defined in Win32OpenGLRenderManager.cpp; keeps <windows.h> types out of this header
+struct InternalData2;
+
+///A Win32 window with an OpenGL (WGL) context.
+///Typical frame as implemented in the .cpp: startRendering(), draw, endRendering();
+///poll requestedExit() to know when the user asked to quit.
+class Win32OpenGLWindow
+{
+	protected:
+		
+		///platform state (window, device context, GL context, sizes), defined in the .cpp
+		struct InternalData2*	m_data;
+		
+		///sets the pixel format and creates the GL context for the window
+		void enableOpenGL();
+		
+		///deletes the GL context and releases the device context
+		void disableOpenGL();
+
+		///polls the Win32 message queue without blocking
+		void pumpMessage();
+	
+		
+
+public:
+
+	Win32OpenGLWindow();
+
+	virtual ~Win32OpenGLWindow();
+
+	virtual	void	init(); //default implementation uses default settings for width/height/fullscreen
+
+	///creates a window of the given size, or attaches to windowHandle (an HWND) when it is non-zero
+	void	init(int width,int height, bool fullscreen=false, int colorBitsPerPixel=0, void* windowHandle=0);
+	
+	///changes the desktop display mode; zero width/height fall back to the stored window size
+	void	switchFullScreen(bool fullscreen,int width=0,int height=0,int colorBitsPerPixel=0);
+
+	///disables OpenGL and destroys the window
+	virtual	void	exit();
+
+
+	///pumps messages, clears the buffers and sets up the projection for a new frame
+	virtual	void	startRendering();
+
+	///empty in this class
+	virtual	void	renderAllObjects();
+
+	///swaps the buffers
+	virtual	void	endRendering();
+
+	///not implemented in this class: returns 0
+	virtual	float	getTimeInSeconds();
+
+	///not implemented in this class: does nothing
+	virtual void	setDebugMessage(int x,int y,const char* message);
+	
+	///true after a quit message has been received
+	virtual bool requestedExit();
+
+};
+
+#endif //_WIN32_OPENGL_RENDER_MANAGER_H
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/main.cpp
@@ -0,0 +1,224 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+//
+//#include "vld.h"
+#include <GL/glew.h>
+
+#include "GLInstancingRenderer.h"
+
+
+#include "GLInstancingRenderer.h"
+#include "../opengl_interop/btOpenCLGLInteropBuffer.h"
+#include "Win32OpenGLRenderManager.h"
+#include "CLPhysicsDemo.h"
+#include "../broadphase_benchmark/btGridBroadphaseCl.h"
+#include "../../opencl/gpu_rigidbody_pipeline/btGpuNarrowPhaseAndSolver.h"
+#include "ShapeData.h"
+#include "LinearMath/btQuickprof.h"
+
+// Grid dimensions used by createScene() when laying out the instances.
+// createScene() only iterates over NUM_OBJECTS_Y/2 layers for each of its two shapes.
+int NUM_OBJECTS_X = 32;
+int NUM_OBJECTS_Y = 24;
+int NUM_OBJECTS_Z = 32;
+
+
+// Factors applied to the grid indices when createScene() computes instance positions.
+float X_GAP = 2.f;
+float Y_GAP = 2.f;
+float Z_GAP = 2.f;
+
+// Defined in another translation unit; main() prints it once as "numPairsOut (culled)".
+extern int numPairsOut;
+
+
+// Populates both the renderer and the physics simulation with the demo scene:
+// a grid of barrels, a grid of cubes and one extra 'plane' instance, then uploads the bodies
+// with physicsSim.writeBodiesToGpu().
+// Graphics and physics instances are registered in the same order; 'index' counts the
+// barrel/cube instances and is handed to the physics side as an opaque user pointer.
+void createScene(GLInstancingRenderer& renderer,CLPhysicsDemo& physicsSim)
+{
+	// vertex stride of the shape tables: 9 floats per vertex
+	// NOTE(review): presumably position/normal/uv data -- the tables live in ShapeData.h, confirm there
+	int strideInBytes = sizeof(float)*9;
+
+	int barrelShapeIndex = -1;
+	int cubeShapeIndex = -1;
+
+	float position[4]={0,0,0,0};
+	float orn[4] = {0,0,0,1};
+	float color[4] = {1,1,1,1};
+	int index=0;
+#if 1
+	{
+		// element counts are derived from the static table sizes
+		int numVertices = sizeof(barrel_vertices)/strideInBytes;
+		int numIndices = sizeof(barrel_indices)/sizeof(int);
+		barrelShapeIndex = renderer.registerShape(&barrel_vertices[0],numVertices,barrel_indices,numIndices);
+	}
+
+
+	float barrelScaling[4] = {2,2,2,1};
+
+
+	int barrelCollisionShapeIndex = physicsSim.registerCollisionShape(&barrel_vertices[0],strideInBytes, sizeof(barrel_vertices)/strideInBytes,&barrelScaling[0]);
+	
+
+
+	// barrel grid: NUM_OBJECTS_X x (NUM_OBJECTS_Y/2) x NUM_OBJECTS_Z instances
+	for (int i=0;i<NUM_OBJECTS_X;i++)
+	{
+		for (int j=0;j<(NUM_OBJECTS_Y/2);j++)
+		{
+			for (int k=0;k<NUM_OBJECTS_Z;k++)
+			{
+				// the lowest layer (j==0) gets mass 0, all other layers mass 1
+				// NOTE(review): mass 0 presumably marks a static body -- confirm in CLPhysicsDemo
+				float mass = j? 1.f : 0.f;
+
+				// NUM_OBJECTS_*/2 is an integer division; the barrel grid is shifted along x and z
+				position[0]=(i*X_GAP-NUM_OBJECTS_X/2)+5;
+				position[1]=(j*Y_GAP*2-NUM_OBJECTS_Y/2);
+				position[2]=(k*Z_GAP-NUM_OBJECTS_Z/2)-NUM_OBJECTS_Z*3;
+				position[3] = 1.f;
+				
+				renderer.registerGraphicsInstance(barrelShapeIndex,position,orn,color,barrelScaling);
+				// the running instance index is passed as the user pointer value
+				void* ptr = (void*) index;
+				physicsSim.registerPhysicsInstance(mass,  position, orn, barrelCollisionShapeIndex,ptr);
+				
+				index++;
+			}
+		}
+	}
+#endif
+
+	float cubeScaling[4] = {2,2,2,1};
+	int cubeCollisionShapeIndex = physicsSim.registerCollisionShape(&cube_vertices[0],strideInBytes, sizeof(cube_vertices)/strideInBytes,&cubeScaling[0]);
+
+
+	{
+		int numVertices = sizeof(cube_vertices)/strideInBytes;
+		int numIndices = sizeof(cube_indices)/sizeof(int);
+		cubeShapeIndex = renderer.registerShape(&cube_vertices[0],numVertices,cube_indices,numIndices);
+	}
+
+	// cube grid: same dimensions as the barrel grid, every cube has mass 1
+	for (int i=0;i<NUM_OBJECTS_X;i++)
+	{
+		for (int j=0;j<NUM_OBJECTS_Y/2;j++)
+		{
+			for (int k=0;k<NUM_OBJECTS_Z;k++)
+			{
+				float mass = 1.f;//j? 1.f : 0.f;
+
+				// odd layers are offset by one unit along x and z via (j&1)
+				position[0]=(i*X_GAP-NUM_OBJECTS_X/2)+(j&1);
+				position[1]=(j*Y_GAP-NUM_OBJECTS_Y/2);
+				position[2]=(k*Z_GAP-NUM_OBJECTS_Z/2)+(j&1);
+				position[3] = 1.f;
+				
+				renderer.registerGraphicsInstance(cubeShapeIndex,position,orn,color,cubeScaling);
+				void* ptr = (void*) index;
+				physicsSim.registerPhysicsInstance(mass,  position, orn, cubeCollisionShapeIndex,ptr);
+				
+				index++;
+			}
+		}
+	}
+
+	if (1)
+	{
+		//add some 'special' plane shape
+		// registered with mass 0 and collision shape index -1; 'index' is not incremented afterwards
+		void* ptr = (void*) index;
+		position[0] = 0.f;
+		position[1] = -NUM_OBJECTS_Y/2-1;
+		position[2] = 0.f;
+		position[3] = 1.f;
+
+		physicsSim.registerPhysicsInstance(0.f,position, orn, -1,ptr);
+		// drawn as a red cube scaled to 5000 x 0.01 x 5000 (this reuses and overwrites cubeScaling)
+		color[0] = 1.f;
+		color[1] = 0.f;
+		color[2] = 0.f;
+		cubeScaling[0] = 5000.f;
+		cubeScaling[1] = 0.01f;
+		cubeScaling[2] = 5000.f;
+
+		renderer.registerGraphicsInstance(cubeShapeIndex,position,orn,color,cubeScaling);
+	}
+	physicsSim.writeBodiesToGpu();
+
+
+}
+
+// Demo entry point: opens the window, initializes GLEW, the instancing renderer and the
+// OpenCL physics demo, builds the scene and then alternates simulation steps and rendering
+// until the window requests exit. Profiling output is dumped once, after the first frames.
+// Returns 0 on normal exit and 1 when GLEW could not be initialized.
+int main(int argc, char* argv[])
+{
+	Win32OpenGLWindow* window = new Win32OpenGLWindow();
+
+	window->init(1024,768);
+
+	// Without a successful glewInit() the OpenGL entry points used by the renderer are not
+	// loaded, so bail out instead of continuing with an unusable context.
+	GLenum err = glewInit();
+	if (err != GLEW_OK)
+	{
+		printf("glewInit failed: %s\n", (const char*)glewGetErrorString(err));
+		window->exit();
+		delete window;
+		return 1;
+	}
+
+	// one empty frame before the heavier initialization below
+	window->startRendering();
+	window->endRendering();
+
+	GLInstancingRenderer render;
+
+	CLPhysicsDemo demo(window);
+
+	bool useInterop = true;
+	demo.init(-1,-1,useInterop);
+
+	render.InitShaders();
+
+	if (useInterop)
+	{
+		demo.setupInterop();
+	}
+
+	createScene(render, demo);
+
+	printf("num objects = %d\n", NUM_OBJECTS_X*NUM_OBJECTS_Y*NUM_OBJECTS_Z);
+
+	render.writeTransforms();
+
+	while (!window->requestedExit())
+	{
+		CProfileManager::Reset();
+
+		demo.stepSimulation();
+
+		window->startRendering();
+		render.RenderScene();
+		window->endRendering();
+
+		CProfileManager::Increment_Frame_Counter();
+
+		// print the profiling statistics exactly once, on the 11th frame
+		static bool printStats  = true;
+
+		if (printStats)
+		{
+			static int count = 10;
+			count--;
+			if (count<0)
+			{
+				CProfileManager::dumpAll();
+				//printf("total broadphase pairs= %d\n", gFpIO.m_numOverlap);
+				printf("numPairsOut (culled)  = %d\n", numPairsOut);
+				printStats  = false;
+			}
+		}
+	}
+
+	demo.cleanup();
+
+	render.CleanupShaders();
+	window->exit();
+	delete window;
+
+	return 0;
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/gpu_rigidbody_pipeline2/premake4.lua
@@ -0,0 +1,5 @@
+
+-- Only the AMD sub-project is enabled for this demo.
+include("AMD")
+-- include "Intel"
+-- include "NVIDIA"
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/integration/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/integration/AMD/premake4.lua
@@ -0,0 +1,34 @@
+	
+-- Console application "OpenCL_integration_AMD".
+-- The project is only emitted when findOpenCL_AMD() returns a true value.
+hasCL = findOpenCL_AMD()
+
+if hasCL then
+
+	project "OpenCL_integration_AMD"
+
+	initOpenCL_AMD()
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	initOpenGL()
+	initGlut()
+	initGlew()
+
+	includedirs {
+		"../../../rendering/BulletMath",
+		"../../primitives"
+	}
+
+	files {
+		"../main.cpp",
+		"../../basic_initialize/btOpenCLUtils.cpp",
+		"../../basic_initialize/btOpenCLUtils.h",
+		"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+		"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+		"../../opengl_interop/btStopwatch.cpp",
+		"../../opengl_interop/btStopwatch.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/opencl/integration/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/integration/Intel/premake4.lua
@@ -0,0 +1,36 @@
+	
+-- Console application "OpenCL_integration_Intel".
+-- The project is only emitted when findOpenCL_Intel() returns a true value.
+hasCL = findOpenCL_Intel()
+
+if hasCL then
+
+	project "OpenCL_integration_Intel"
+
+	initOpenCL_Intel()
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	initOpenGL()
+	initGlut()
+	initGlew()
+
+	includedirs {
+		"../../../rendering/BulletMath",
+		"../../primitives"
+	}
+
+	files {
+		"../main.cpp",
+		"../../basic_initialize/btOpenCLUtils.cpp",
+		"../../basic_initialize/btOpenCLUtils.h",
+		"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+		"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+		"../../opengl_interop/btStopwatch.cpp",
+		"../../opengl_interop/btStopwatch.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/opencl/integration/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/integration/NVIDIA/premake4.lua
@@ -0,0 +1,35 @@
+	
+-- Console application "OpenCL_integration_NVIDIA".
+-- The project is only emitted when findOpenCL_NVIDIA() returns a true value.
+hasCL = findOpenCL_NVIDIA()
+
+if hasCL then
+
+	project "OpenCL_integration_NVIDIA"
+
+	initOpenCL_NVIDIA()
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	initOpenGL()
+	initGlut()
+	initGlew()
+
+	includedirs {
+		"../../../rendering/BulletMath",
+		"../../primitives"
+	}
+
+	files {
+		"../main.cpp",
+		"../../basic_initialize/btOpenCLUtils.cpp",
+		"../../basic_initialize/btOpenCLUtils.h",
+		"../../opengl_interop/btOpenCLGLInteropBuffer.cpp",
+		"../../opengl_interop/btOpenCLGLInteropBuffer.h",
+		"../../opengl_interop/btStopwatch.cpp",
+		"../../opengl_interop/btStopwatch.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/opencl/integration/integrateKernel.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/integration/integrateKernel.cl
@@ -0,0 +1,73 @@
+MSTRINGIFY(
+
+// Quaternion product q1*q2; the scalar part of each quaternion is stored in .w
+float4 quatMult(float4 q1, float4 q2)
+{
+	float x = q1.w * q2.x + q1.x * q2.w + q1.y * q2.z - q1.z * q2.y;
+	float y = q1.w * q2.y + q1.y * q2.w + q1.z * q2.x - q1.x * q2.z;
+	float z = q1.w * q2.z + q1.z * q2.w + q1.x * q2.y - q1.y * q2.x;
+	float w = q1.w * q2.w - q1.x * q2.x - q1.y * q2.y - q1.z * q2.z;
+	return (float4)(x, y, z, w);
+}
+
+// Scales q to unit length; if the length is not greater than zero the identity
+// quaternion (0,0,0,1) is returned instead.
+float4 quatNorm(float4 q)
+{
+	float len = native_sqrt(dot(q, q));
+	if(!(len > 0.f))
+	{
+		return (float4)(0.f, 0.f, 0.f, 1.f);
+	}
+	return q * (1.f / len);
+}
+
+
+
+// Integrates one body per work item over a fixed time step of 1/60 s.
+// g_vertexBuffer holds float4 values: element [startOffset/4 + nodeID] is advanced by the
+// linear velocity, element [startOffset/4 + numNodes + nodeID] is treated as an orientation
+// quaternion and rotated by the angular velocity.
+// NOTE(review): startOffset is presumably given in floats (hence the division by 4) -- confirm with the host code.
+__kernel void 
+  interopKernel( const int startOffset, const int numNodes, __global float4 *g_vertexBuffer,
+		   __global float4 *linVel,
+		   __global float4 *pAngVel)
+{
+	int nodeID = get_global_id(0);
+	// float literals throughout: unsuffixed literals are double precision in OpenCL C,
+	// which this single precision kernel does not need
+	float timeStep = 0.0166666f;
+	
+	float BT_GPU_ANGULAR_MOTION_THRESHOLD = (0.25f * 3.14159254f);
+	
+	if( nodeID < numNodes )
+	{
+		// all four components are advanced, including .w
+		g_vertexBuffer[nodeID + startOffset/4] += linVel[nodeID]*timeStep;
+		
+		//		g_vertexBuffer[nodeID + startOffset/4+numNodes] += angVel[nodeID];
+
+		float4 axis;
+		float4 angvel = pAngVel[nodeID];
+		float fAngle = native_sqrt(dot(angvel, angvel));
+		//limit the angular motion
+		if(fAngle*timeStep > BT_GPU_ANGULAR_MOTION_THRESHOLD)
+		{
+			fAngle = BT_GPU_ANGULAR_MOTION_THRESHOLD / timeStep;
+		}
+		if(fAngle < 0.001f)
+		{
+			// small angle: Taylor expansion of sin(0.5*fAngle*timeStep)/fAngle,
+			// i.e. 0.5*t - t^3*fAngle^2/48
+			axis = angvel * (0.5f*timeStep-(timeStep*timeStep*timeStep)*0.020833333333f * fAngle * fAngle);
+		}
+		else
+		{
+			// general case: sin(0.5*fAngle*timeStep)/fAngle
+			axis = angvel * ( native_sin(0.5f * fAngle * timeStep) / fAngle);
+		}
+		// incremental rotation: vector part from 'axis', scalar part cos(half angle)
+		float4 dorn = axis;
+		dorn.w = native_cos(fAngle * timeStep * 0.5f);
+		float4 orn0 = g_vertexBuffer[nodeID + startOffset/4+numNodes];
+		float4 predictedOrn = quatMult(dorn, orn0);
+		// renormalize before writing the orientation back
+		predictedOrn = quatNorm(predictedOrn);
+		g_vertexBuffer[nodeID + startOffset/4+numNodes]=predictedOrn;
+	}
+}
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/integration/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/integration/main.cpp
--- a/Extras/RigidBodyGpuPipeline/opencl/integration/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/integration/premake4.lua
@@ -0,0 +1,5 @@
+
+-- One sub-project per OpenCL vendor.
+include("AMD")
+include("Intel")
+include("NVIDIA")
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/AMD/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/AMD/premake4.lua
@@ -0,0 +1,33 @@
+	
+-- Console application "OpenCL_GL_interop_AMD".
+-- The project is only emitted when findOpenCL_AMD() returns a true value.
+hasCL = findOpenCL_AMD()
+
+if hasCL then
+
+	project "OpenCL_GL_interop_AMD"
+
+	initOpenCL_AMD()
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	initOpenGL()
+	initGlut()
+	initGlew()
+
+	includedirs {
+		"../../../rendering/BulletMath"
+	}
+
+	files {
+		"../main.cpp",
+		"../../basic_initialize/btOpenCLUtils.cpp",
+		"../../basic_initialize/btOpenCLUtils.h",
+		"../btOpenCLGLInteropBuffer.cpp",
+		"../btOpenCLGLInteropBuffer.h",
+		"../btStopwatch.cpp",
+		"../btStopwatch.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/Intel/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/Intel/premake4.lua
@@ -0,0 +1,34 @@
+	
+-- Console application "OpenCL_GL_interop_Intel".
+-- The project is only emitted when findOpenCL_Intel() returns a true value.
+hasCL = findOpenCL_Intel()
+
+if hasCL then
+
+	project "OpenCL_GL_interop_Intel"
+
+	initOpenCL_Intel()
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	initOpenGL()
+	initGlut()
+	initGlew()
+
+	includedirs {
+		"../../../rendering/BulletMath"
+	}
+
+	files {
+		"../main.cpp",
+		"../../basic_initialize/btOpenCLUtils.cpp",
+		"../../basic_initialize/btOpenCLUtils.h",
+		"../btOpenCLGLInteropBuffer.cpp",
+		"../btOpenCLGLInteropBuffer.h",
+		"../btStopwatch.cpp",
+		"../btStopwatch.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/NVIDIA/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/NVIDIA/premake4.lua
@@ -0,0 +1,34 @@
+	
+-- Console application "OpenCL_GL_interop_NVIDIA".
+-- The project is only emitted when findOpenCL_NVIDIA() returns a true value.
+hasCL = findOpenCL_NVIDIA()
+
+if hasCL then
+
+	project "OpenCL_GL_interop_NVIDIA"
+
+	initOpenCL_NVIDIA()
+
+	language "C++"
+	kind "ConsoleApp"
+	targetdir "../../../bin"
+
+	initOpenGL()
+	initGlut()
+	initGlew()
+
+	includedirs {
+		"../../../rendering/BulletMath"
+	}
+
+	files {
+		"../main.cpp",
+		"../../basic_initialize/btOpenCLUtils.cpp",
+		"../../basic_initialize/btOpenCLUtils.h",
+		"../btOpenCLGLInteropBuffer.cpp",
+		"../btOpenCLGLInteropBuffer.h",
+		"../btStopwatch.cpp",
+		"../btStopwatch.h"
+	}
+
+end
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btOpenCLGLInteropBuffer.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btOpenCLGLInteropBuffer.cpp
@@ -0,0 +1,60 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#include "btOpenCLGLInteropBuffer.h"
+	
+// Stores the context, queue and OpenGL buffer name, then creates an OpenCL memory object
+// that refers to the OpenGL buffer 'openGLVBO'.
+btOpenCLGLInteropBuffer::btOpenCLGLInteropBuffer(cl_context	clContext, cl_command_queue	commandQueue,GLuint openGLVBO)
+	: m_clContext(clContext)
+	, m_commandQueue(commandQueue)
+	, m_openGLVBO(openGLVBO)
+{
+	// the buffer is shared with read/write access
+	cl_int ciErrNum = CL_SUCCESS;
+	m_buffer = clCreateFromGLBuffer(m_clContext, CL_MEM_READ_WRITE, m_openGLVBO, &ciErrNum);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+// Releases the OpenCL memory object held in m_buffer.
+btOpenCLGLInteropBuffer::~btOpenCLGLInteropBuffer()
+{
+	// keep the status returned by the release call so the check below tests the real result
+	cl_int ciErrNum = clReleaseMemObject (m_buffer);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+// Enqueues an acquire of the shared buffer immediately followed by its release on the
+// command queue; no work is enqueued in between and the queue is not flushed or finished.
+void	btOpenCLGLInteropBuffer::copyCL2GL()
+{
+	cl_int ciErrNum = clEnqueueAcquireGLObjects(m_commandQueue, 1, &m_buffer, 0, 0, NULL);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	//do some stuff
+
+	ciErrNum = clEnqueueReleaseGLObjects(m_commandQueue, 1, &m_buffer, 0, 0, 0);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+
+	//only wait if necessary
+//	clFinish(m_commandQueue);
+}
+
+// Currently a no-op: the body is empty.
+void	btOpenCLGLInteropBuffer::copyGL2CL()
+{
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btOpenCLGLInteropBuffer.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btOpenCLGLInteropBuffer.h
@@ -0,0 +1,49 @@
+/*
+Bullet Continuous Collision Detection and Physics Library
+Copyright (c) 2011 Advanced Micro Devices, Inc.  http://bulletphysics.org
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///original author: Erwin Coumans
+
+#ifndef BT_OPENCL_GL_INTEROP_BUFFER_H
+#define BT_OPENCL_GL_INTEROP_BUFFER_H
+
+#include "btGlutInclude.h"
+
+#include "../basic_initialize/btOpenCLInclude.h"
+
+// Couples an OpenGL buffer object (identified by a GLuint name) with an OpenCL memory object
+// and keeps the OpenCL context and command queue it was set up with.
+class btOpenCLGLInteropBuffer
+{
+
+	cl_context	m_clContext;
+	cl_command_queue	m_commandQueue;
+	// OpenCL memory object paired with m_openGLVBO
+	cl_mem	m_buffer;
+	GLuint m_openGLVBO;
+
+public:
+	
+	btOpenCLGLInteropBuffer(cl_context	clContext, cl_command_queue	commandQueue,GLuint openGLVBO);
+	virtual ~btOpenCLGLInteropBuffer();
+
+	void	copyCL2GL();
+
+	void	copyGL2CL();
+
+	// Returns the OpenCL memory object handle; the handle is returned as is (no retain call).
+	cl_mem	getCLBUffer()
+	{
+		return m_buffer;
+	}
+};
+
+#endif //BT_OPENCL_GL_INTEROP_BUFFER_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btStopwatch.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btStopwatch.cpp
@@ -0,0 +1,182 @@
+/*
+Stopwatch for timing and profiling for the Bullet Physics Library, http://bulletphysics.org
+Copyright (c) 2003-2011 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#include "btStopwatch.h"
+
+
+#ifdef __CELLOS_LV2__
+#include <sys/sys_time.h>
+#include <sys/time_util.h>
+#include <stdio.h>
+#endif
+
+#if defined (SUNOS) || defined (__SUNOS__) 
+#include <stdio.h> 
+#endif
+
+#if defined(WIN32) || defined(_WIN32)
+
+#define BT_USE_WINDOWS_TIMERS
+#define WIN32_LEAN_AND_MEAN
+#define NOWINRES
+#define NOMCX
+#define NOIME 
+
+#ifdef _XBOX
+	#include <Xtl.h>
+#else //_XBOX
+	#include <windows.h>
+#endif //_XBOX
+
+#include <time.h>
+
+
+#else //_WIN32
+#include <sys/time.h>
+#endif //_WIN32
+
+// Smaller of two values. Arguments are fully parenthesized; each one may be evaluated twice.
+#define mymin(a,b) (((a) < (b)) ? (a) : (b))
+
+// Platform specific timer state of a btStopwatch. Exactly one of the three variants below
+// is compiled in, depending on BT_USE_WINDOWS_TIMERS and __CELLOS_LV2__.
+struct btStopwatchData
+{
+
+#ifdef BT_USE_WINDOWS_TIMERS
+	// performance counter frequency, queried once in the btStopwatch constructor
+	LARGE_INTEGER mClockFrequency;
+	// GetTickCount() value captured by reset(); used to cross-check the performance counter
+	DWORD mStartTick;
+	// elapsed counter ticks computed by the previous getTimeMicroseconds() call
+	LONGLONG mPrevElapsedTime;
+	// performance counter value captured by reset()
+	LARGE_INTEGER mStartTime;
+#else
+#ifdef __CELLOS_LV2__
+	// time base value captured by reset()
+	uint64_t	mStartTime;
+#else
+	// gettimeofday() result captured by reset()
+	struct timeval mStartTime;
+#endif //__CELLOS_LV2__
+#endif //BT_USE_WINDOWS_TIMERS
+
+};
+
+
+// Allocates the timer state and starts measuring from the moment of construction.
+btStopwatch::btStopwatch()
+	: m_data(new btStopwatchData)
+{
+#ifdef BT_USE_WINDOWS_TIMERS
+	// the counter frequency is queried once and reused for every conversion
+	QueryPerformanceFrequency(&m_data->mClockFrequency);
+#endif
+	reset();
+}
+
+// Frees the timer state allocated by the constructors.
+btStopwatch::~btStopwatch()
+{
+	delete m_data;
+}
+
+// Deep copy: the new stopwatch gets its own timer state holding the same values as 'other'.
+btStopwatch::btStopwatch(const btStopwatch& other)
+	: m_data(new btStopwatchData(*other.m_data))
+{
+}
+
+// Copies the timer state of 'other' into the state already owned by this stopwatch.
+btStopwatch& btStopwatch::operator=(const btStopwatch& other)
+{
+	if (this != &other)
+	{
+		*m_data = *other.m_data;
+	}
+	return *this;
+}
+
+
+/// Restarts the measurement: stores the current time as the new reference point.
+void btStopwatch::reset()
+{
+#if defined(BT_USE_WINDOWS_TIMERS)
+	// remember both the performance counter and the tick count
+	QueryPerformanceCounter(&m_data->mStartTime);
+	m_data->mStartTick = GetTickCount();
+	m_data->mPrevElapsedTime = 0;
+#elif defined(__CELLOS_LV2__)
+	typedef uint64_t  ClockSize;
+	ClockSize now;
+	//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
+	SYS_TIMEBASE_GET( now );
+	m_data->mStartTime = now;
+#else
+	gettimeofday(&m_data->mStartTime, 0);
+#endif
+}
+
+/// Elapsed time in milliseconds since construction or the last reset(),
+/// derived from the microsecond reading.
+float btStopwatch::getTimeMilliseconds()
+{
+	const unsigned long int elapsedMicroseconds = getTimeMicroseconds();
+	return elapsedMicroseconds/1000.f;
+}
+
+	/// Returns the time in us since the last call to reset or since 
+	/// the stopwatch was created.
+	/// The Windows variant also updates the stored timer state (mStartTime, mPrevElapsedTime).
+unsigned long int btStopwatch::getTimeMicroseconds()
+{
+#ifdef BT_USE_WINDOWS_TIMERS
+		LARGE_INTEGER currentTime;
+		QueryPerformanceCounter(&currentTime);
+		// elapsed performance counter ticks since reset()
+		LONGLONG elapsedTime = currentTime.QuadPart - m_data->mStartTime.QuadPart;
+
+		// Compute the number of millisecond ticks elapsed.
+		unsigned long msecTicks = (unsigned long)(1000 * elapsedTime / m_data->mClockFrequency.QuadPart);
+
+		// Check for unexpected leaps in the Win32 performance counter.  
+		// (This is caused by unexpected data across the PCI to ISA 
+		// bridge, aka south bridge.  See Microsoft KB274323.)
+		// The performance counter reading is compared against GetTickCount(); a difference
+		// of more than 100 ms in either direction triggers the correction below.
+		unsigned long elapsedTicks = GetTickCount() - m_data->mStartTick;
+		signed long msecOff = (signed long)(msecTicks - elapsedTicks);
+		if (msecOff < -100 || msecOff > 100)
+		{
+			// Adjust the starting time forwards.
+			// candidates: the offset converted to counter ticks, and the ticks elapsed
+			// since the previous call
+			LONGLONG msecAdjustment = mymin(msecOff * 
+				m_data->mClockFrequency.QuadPart / 1000, elapsedTime - 
+				m_data->mPrevElapsedTime);
+			m_data->mStartTime.QuadPart += msecAdjustment;
+			elapsedTime -= msecAdjustment;
+		}
+
+		// Store the current elapsed time for adjustments next time.
+		m_data->mPrevElapsedTime = elapsedTime;
+
+		// Convert to microseconds.
+		unsigned long usecTicks = (unsigned long)(1000000 * elapsedTime / 
+			m_data->mClockFrequency.QuadPart);
+
+		return usecTicks;
+#else
+
+#ifdef __CELLOS_LV2__
+		// time base ticks per microsecond
+		uint64_t freq=sys_time_get_timebase_frequency();
+		double dFreq=((double) freq)/ 1000000.0;
+		typedef uint64_t  ClockSize;
+		ClockSize newTime;
+		//__asm __volatile__( "mftb %0" : "=r" (newTime) : : "memory");
+		SYS_TIMEBASE_GET( newTime );
+
+		return (unsigned long int)((double(newTime-m_data->mStartTime)) / dFreq);
+#else
+
+		// seconds and microseconds differences combined into one microsecond count
+		struct timeval currentTime;
+		gettimeofday(&currentTime, 0);
+		return (currentTime.tv_sec - m_data->mStartTime.tv_sec) * 1000000 + (currentTime.tv_usec - m_data->mStartTime.tv_usec);
+#endif//__CELLOS_LV2__
+#endif 
+}
+
+
+
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btStopwatch.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/btStopwatch.h
@@ -0,0 +1,45 @@
+/*
+Stopwatch for timing and profiling for the Bullet Physics Library, http://bulletphysics.org
+Copyright (c) 2003-2011 Erwin Coumans
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef BT_STOPWATCH_H
+#define BT_STOPWATCH_H
+
+///The btStopwatch is a portable basic clock that measures real-time, use for profiling etc.
+///Copying a stopwatch copies its timer state; each instance owns a separate state object.
+class btStopwatch
+{
+public:
+	btStopwatch();
+
+	btStopwatch(const btStopwatch& other);
+	btStopwatch& operator=(const btStopwatch& other);
+
+	~btStopwatch();
+
+	/// Resets the initial reference time.
+	void reset();
+
+	/// Returns the time in ms since the last call to reset or since 
+	/// the btStopwatch was created.
+	float getTimeMilliseconds();
+
+	/// Returns the time in us since the last call to reset or since 
+	/// the btStopwatch was created.
+	unsigned long int getTimeMicroseconds();
+private:
+	/// platform specific timer state, defined in the implementation file
+	struct btStopwatchData* m_data;
+};
+
+
+#endif //BT_STOPWATCH_H
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/interopKernel.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/interopKernel.cl
@@ -0,0 +1,13 @@
+MSTRINGIFY(
+
+// Adds 0.01 to one float per work item: element [nodeID*4 + startOffset + 1] of g_vertexBuffer.
+// NOTE(review): with 4 floats per node this is presumably the y component -- confirm with the host code.
+__kernel void 
+interopKernel( const int startOffset, const int numNodes, __global float *g_vertexBuffer)
+{
+	int nodeID = get_global_id(0);
+	if( nodeID < numNodes )
+	{
+		// float literal: the buffer is single precision, no double arithmetic is needed
+		g_vertexBuffer[nodeID*4 + startOffset+1] += 0.01f;
+	}
+}
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/main.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/main.cpp
--- a/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/premake4.lua
+++ b/Extras/RigidBodyGpuPipeline/opencl/opengl_interop/premake4.lua
@@ -0,0 +1,5 @@
+
+-- The AMD and Intel sub-projects are enabled; NVIDIA is switched off.
+include("AMD")
+include("Intel")
+-- include "NVIDIA"
+	
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.cpp
@@ -0,0 +1,19 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#include <Adl/Adl.h>
+
+//KernelManager* KernelManager::s_kManager = NULL;
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.h
@@ -0,0 +1,235 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_H
+#define ADL_H
+
+#pragma warning( disable : 4996 )
+#include <Adl/AdlConfig.h>
+#include <Adl/AdlError.h>
+#include <algorithm>
+
+// Fallback max/min macros, only defined when no macro of that name exists yet.
+// Each argument may be evaluated twice, so avoid arguments with side effects.
+#ifndef max
+#define max(a,b)            (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef min
+#define min(a,b)            (((a) < (b)) ? (a) : (b))
+#endif
+
+namespace adl
+{
+
+// Back end selector used by DeviceUtils and stored in Device::m_type.
+enum DeviceType
+{
+	TYPE_CL = 0,
+	TYPE_DX11 = 1,
+	// takes the next value (2)
+	TYPE_HOST,
+};
+
+
+struct Device;
+
+// Non-template base of Buffer<T>; it only provides the BufferType enumeration.
+struct BufferBase
+{
+	enum BufferType
+	{
+		// default type used by the Buffer<T> constructor and allocation functions
+		BUFFER,
+
+		//	for dx
+		BUFFER_CONST,
+		BUFFER_STAGING,
+		BUFFER_APPEND,
+		BUFFER_RAW,
+		BUFFER_W_COUNTER,
+		BUFFER_INDEX,
+		BUFFER_VERTEX,
+
+		//	for cl
+		BUFFER_ZERO_COPY,
+
+	};
+};
+
+// Static helpers to count, create, destroy and synchronize Device objects.
+class DeviceUtils
+{
+	public:
+		// Creation parameters handed to Device::initialize().
+		struct Config
+		{
+			// NOTE: this nested enum is distinct from adl::DeviceType; inside Config the
+			// name DeviceType refers to this one.
+			enum DeviceType
+			{
+				DEVICE_GPU,
+				DEVICE_CPU,
+			};
+
+			//	for CL
+			enum DeviceVendor
+			{
+				VD_AMD,
+				VD_INTEL,
+				VD_NV,
+			};
+
+			// defaults: GPU device, device index 0, AMD vendor
+			Config() : m_type(DEVICE_GPU), m_deviceIdx(0), m_vendor(VD_AMD){}
+
+			DeviceType m_type;
+			int m_deviceIdx;
+			DeviceVendor m_vendor;
+		};
+
+		// In the declarations below DeviceType is adl::DeviceType (TYPE_CL, TYPE_DX11, TYPE_HOST).
+		__inline
+		static
+		int getNDevices( DeviceType type );
+		__inline
+		static Device* allocate( DeviceType type, Config& cfg );
+		__inline
+		static void deallocate( Device* deviceData );
+		__inline
+		static void waitForCompletion( const Device* deviceData );
+};
+
+//==========================
+//	DeviceData
+//==========================
+struct Kernel;
+
+// Polymorphic base of all back end devices. The default implementations do nothing;
+// getKernel() asserts because the base device cannot provide kernels.
+struct Device
+{
+	typedef DeviceUtils::Config Config;
+
+	Device( DeviceType type ) : m_type( type ), m_memoryUsage(0)
+	{
+	}
+
+	// Devices are destroyed through a Device* (see DeviceUtils::deallocate), so the
+	// destructor has to be virtual for the derived destructor to run.
+	virtual ~Device() {}
+
+	virtual void* getContext() const { return 0; }
+	virtual void initialize(const Config& cfg){}
+	virtual void release(){}
+	virtual void waitForCompletion() const {}
+	virtual void getDeviceName( char nameOut[128] ) const {}
+	virtual Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true ) const { ADLASSERT(0); return 0;}
+	// memory usage counter; DeviceUtils::deallocate asserts that it is zero
+	virtual unsigned int getUsedMemory() const { return m_memoryUsage; }
+
+	DeviceType m_type;
+	unsigned int m_memoryUsage;
+};
+
+//==========================
+//	Buffer
+//==========================
+
+template<typename T>
+struct HostBuffer;
+//	overload each deviceDatas
+// Typed buffer bound to a Device. The member functions are only declared here;
+// their definitions are provided by the .inl files included at the end of this header.
+template<typename T>
+struct Buffer : public BufferBase
+{
+	__inline
+	Buffer();
+	__inline
+	Buffer(const Device* device, int nElems, BufferType type = BUFFER );
+	__inline
+	virtual ~Buffer();
+	
+	__inline
+	void setRawPtr( const Device* device, T* ptr, int size, BufferType type = BUFFER );
+	__inline
+	void allocate(const Device* device, int nElems, BufferType type = BUFFER );
+	// host -> buffer copy of nElems elements, starting at element dstOffsetNElems of the buffer
+	__inline
+	void write(T* hostSrcPtr, int nElems, int dstOffsetNElems = 0);
+	// buffer -> host copy of nElems elements, starting at element srcOffsetNElems of the buffer
+	__inline
+	void read(T* hostDstPtr, int nElems, int srcOffsetNElems = 0) const;
+	// buffer <-> buffer variants
+	__inline
+	void write(Buffer<T>& src, int nElems);
+	__inline
+	void read(Buffer<T>& dst, int nElems) const;
+//	__inline
+//	Buffer<T>& operator = (const Buffer<T>& buffer);
+	__inline
+	int getSize() const { return m_size; }
+
+	// Type of the owning device; asserts that a device has been set.
+	DeviceType getType() const { ADLASSERT( m_device ); return m_device->m_type; }
+
+
+	const Device* m_device;
+	// NOTE(review): presumably the element count (not bytes) -- confirm in the .inl files
+	int m_size;
+	T* m_ptr;
+	//	for DX11
+	void* m_uav;
+	void* m_srv;
+	bool m_allocated;	//	todo. move this to a bit
+};
+
+// Helpers to obtain a buffer usable on a given device type and to hand it back.
+// COPY selects whether data is copied; copySize defaults to -1.
+// NOTE(review): the exact meaning of copySize == -1 is defined in the .inl files -- confirm there.
+class BufferUtils
+{
+public:
+	// 'typename' is not needed (and not allowed by conforming compilers) in front of
+	// Buffer<T>, because it is not a nested dependent name.
+	template<DeviceType TYPE, bool COPY, typename T>
+	__inline
+	static
+	Buffer<T>* map(const Device* device, const Buffer<T>* in, int copySize = -1);
+
+	template<bool COPY, typename T>
+	__inline
+	static
+	void unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize = -1 );
+};
+
+//==========================
+//	HostBuffer
+//==========================
+struct DeviceHost;
+
+// Buffer<T> variant offering direct element access through operator[] and begin().
+// Members inherited from the dependent base Buffer<T> are qualified (this->, BufferBase::)
+// so that the template also compiles with two-phase name lookup.
+template<typename T>
+struct HostBuffer : public Buffer<T>
+{
+	__inline
+	HostBuffer():Buffer<T>(){}
+	__inline
+	HostBuffer(const Device* device, int nElems, BufferBase::BufferType type = BufferBase::BUFFER ) : Buffer<T>(device, nElems, type) {}
+//	HostBuffer(const Device* deviceData, T* rawPtr, int nElems);
+
+
+	__inline
+	T& operator[](int idx);
+	__inline
+	const T& operator[](int idx) const;
+	// pointer to the first element
+	__inline
+	T* begin() { return this->m_ptr; }
+
+	__inline
+	HostBuffer<T>& operator = (const Buffer<T>& device);
+};
+
+};
+
+#include <Adl/AdlKernel.h>
+#if defined(ADL_ENABLE_CL)
+	#include <Adl/CL/AdlCL.inl>
+#endif
+#if defined(ADL_ENABLE_DX11)
+	#include <Adl/DX11/AdlDX11.inl>
+#endif
+
+#include <Adl/Host/AdlHost.inl>
+#include <Adl/AdlKernel.inl>
+#include <Adl/Adl.inl>
+
+
+#include <Adl/AdlStopwatch.h>
+
+#include <Adl/Host/AdlStopwatchHost.inl>
+#include <Adl/AdlStopwatch.inl>
+
+#endif
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Adl.inl
@@ -0,0 +1,344 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+namespace adl
+{
+
+//	Number of devices available for the given backend. Backends that are not
+//	compiled in, and every other device type, report a single device.
+int DeviceUtils::getNDevices( DeviceType type )
+{
+#if defined(ADL_ENABLE_CL)
+	if( type == TYPE_CL )
+		return DeviceCL::getNDevices();
+#endif
+#if defined(ADL_ENABLE_DX11)
+	if( type == TYPE_DX11 )
+		return DeviceDX11::getNDevices();
+#endif
+	return 1;
+}
+
+//	Creates and initializes a Device of the requested backend type.
+//	Returns 0 when the type is unknown or its backend is not compiled in
+//	(after asserting in debug builds). Release the result with
+//	DeviceUtils::deallocate().
+Device* DeviceUtils::allocate( DeviceType type, Config& cfg )
+{
+	//	Start from null: the default branch must not leave the pointer
+	//	uninitialized, because ADLASSERT does not stop execution in release builds.
+	Device* deviceData = 0;
+	switch( type )
+	{
+#if defined(ADL_ENABLE_CL)
+	case TYPE_CL:
+		deviceData = new DeviceCL();
+		break;
+#endif
+#if defined(ADL_ENABLE_DX11)
+	case TYPE_DX11:
+		deviceData = new DeviceDX11();
+		break;
+#endif
+	case TYPE_HOST:
+		deviceData = new DeviceHost();
+		break;
+	default:
+		ADLASSERT( 0 );
+		break;
+	};
+
+	if( deviceData == 0 )
+		return 0;
+
+	deviceData->initialize( cfg );
+	return deviceData;
+}
+
+//	Releases and deletes a device created by DeviceUtils::allocate().
+//	A null pointer is ignored. In debug builds this asserts that the device
+//	reports no memory still in use.
+void DeviceUtils::deallocate( Device* deviceData )
+{
+	if( deviceData == 0 )
+		return;
+
+	ADLASSERT( deviceData->getUsedMemory() == 0 );
+	deviceData->release();
+	delete deviceData;
+}
+
+//	Forwards to deviceData->waitForCompletion(). deviceData must not be null.
+void DeviceUtils::waitForCompletion( const Device* deviceData )
+{
+	deviceData->waitForCompletion();
+}
+
+//	Dispatch helpers: cast a Device pointer to the concrete backend class
+//	matching a DeviceType and invoke `func` on it. One variant of each macro
+//	is defined per combination of ADL_ENABLE_DX11 / ADL_ENABLE_CL, so only
+//	compiled-in backends are referenced; any other type hits ADLASSERT(0).
+//	 - SELECT_DEVICEDATA( type, func ) switches on `type` and casts the
+//	   identifier m_device taken from the enclosing scope.
+//	 - SELECT_DEVICEDATA1( deviceData, func ) switches on deviceData->m_type
+//	   and casts deviceData itself.
+//	Both expand to a bare switch statement (not wrapped in do/while).
+#if defined(ADL_ENABLE_DX11)
+	#if defined(ADL_ENABLE_CL)
+	#define SELECT_DEVICEDATA( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
+		case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
+		case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+
+	#define SELECT_DEVICEDATA1( deviceData, func ) \
+		switch( deviceData->m_type ) \
+		{ \
+		case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
+		case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
+		case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+	#else
+	#define SELECT_DEVICEDATA( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_DX11: ((DeviceDX11*)m_device)->func; break; \
+		case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+
+	#define SELECT_DEVICEDATA1( deviceData, func ) \
+		switch( deviceData->m_type ) \
+		{ \
+		case TYPE_DX11: ((DeviceDX11*)deviceData)->func; break; \
+		case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+	#endif
+#else
+	#if defined(ADL_ENABLE_CL)
+	#define SELECT_DEVICEDATA( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_CL: ((DeviceCL*)m_device)->func; break; \
+		case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+
+	#define SELECT_DEVICEDATA1( deviceData, func ) \
+		switch( deviceData->m_type ) \
+		{ \
+		case TYPE_CL: ((DeviceCL*)deviceData)->func; break; \
+		case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+	#else
+	#define SELECT_DEVICEDATA( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_HOST: ((DeviceHost*)m_device)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+
+	#define SELECT_DEVICEDATA1( deviceData, func ) \
+		switch( deviceData->m_type ) \
+		{ \
+		case TYPE_HOST: ((DeviceHost*)deviceData)->func; break; \
+		default: ADLASSERT(0); break; \
+		}
+	#endif
+#endif
+
+//	Default constructor: an empty buffer that is bound to no device and owns
+//	no storage.
+template<typename T>
+Buffer<T>::Buffer()
+	: m_device( 0 )
+	, m_size( 0 )
+	, m_ptr( 0 )
+	, m_uav( 0 )
+	, m_srv( 0 )
+	, m_allocated( false )
+{
+}
+
+//	Constructs the buffer and immediately allocates nElems elements on
+//	deviceData. m_device starts out null because allocate() asserts on that.
+template<typename T>
+Buffer<T>::Buffer(const Device* deviceData, int nElems, BufferType type )
+	: m_device( 0 )
+{
+	allocate( deviceData, nElems, type );
+}
+
+//	Destructor: storage obtained through allocate() is handed back to the
+//	owning device; a buffer that merely wraps a raw pointer is left alone.
+template<typename T>
+Buffer<T>::~Buffer()
+{
+	const bool ownsStorage = m_allocated && m_device;
+	if( ownsStorage )
+	{
+		SELECT_DEVICEDATA( m_device->m_type, deallocate( this ) );
+	}
+
+	m_size = 0;
+	m_ptr = 0;
+	m_device = 0;
+}
+
+//	Binds this (still unbound) buffer to externally provided storage.
+//	m_allocated is not touched here, so the destructor only releases `ptr`
+//	if the buffer was already flagged as allocated.
+//	Only plain BUFFER type on non-DX11 devices is supported so far.
+template<typename T>
+void Buffer<T>::setRawPtr( const Device* device, T* ptr, int size, BufferType type )
+{
+	ADLASSERT( m_device == 0 );
+	ADLASSERT( type == BUFFER );	//	todo. implement
+	ADLASSERT( device->m_type != TYPE_DX11 );	//	todo. implement set srv, uav
+
+	m_size = size;
+	m_ptr = ptr;
+	m_device = device;
+}
+
+//	Allocates storage for nElems elements on deviceData through the backend
+//	selected by the device type, and marks the buffer as owning it.
+//	The buffer must not already be bound to a device.
+template<typename T>
+void Buffer<T>::allocate(const Device* deviceData, int nElems, BufferType type )
+{
+	ADLASSERT( m_device == 0 );
+
+	//	reset all handles before the backend fills them in
+	m_srv = 0;
+	m_uav = 0;
+	m_ptr = 0;
+	m_size = 0;
+	m_device = deviceData;
+
+	SELECT_DEVICEDATA( deviceData->m_type, allocate( this, nElems, type ) );
+	m_allocated = true;
+}
+
+//	Copies nElems elements from host memory at hostPtr into this buffer,
+//	starting at element offsetNElems, using the owning device's copy().
+//	Asserts that the written range fits inside the buffer.
+//	NOTE(review): callers in this file call waitForCompletion() after copies,
+//	so the copy is presumably allowed to be asynchronous -- confirm per backend.
+template<typename T>
+void Buffer<T>::write(T* hostPtr, int nElems, int offsetNElems)
+{
+	ADLASSERT( nElems+offsetNElems <= m_size );
+	SELECT_DEVICEDATA( m_device->m_type, copy(this, hostPtr, nElems, offsetNElems) );
+}
+
+//	Copies nElems elements of this buffer, starting at element offsetNElems,
+//	into host memory at hostPtr using the owning device's copy().
+//	Asserts that the range read lies inside the buffer, mirroring write().
+template<typename T>
+void Buffer<T>::read(T* hostPtr, int nElems, int offsetNElems) const
+{
+	ADLASSERT( nElems+offsetNElems <= m_size );
+	SELECT_DEVICEDATA( m_device->m_type, copy(hostPtr,this, nElems, offsetNElems) );
+}
+
+//	Copies the first nElems elements of src into this buffer using this
+//	buffer's device. Asserts that nElems fits inside this buffer.
+template<typename T>
+void Buffer<T>::write(Buffer<T>& src, int nElems)
+{
+	ADLASSERT( nElems <= m_size );
+	SELECT_DEVICEDATA( m_device->m_type, copy(this, &src, nElems) );
+}
+
+//	Copies the first nElems elements of this buffer into dst using this
+//	buffer's device. Asserts that nElems does not exceed this buffer's size,
+//	mirroring the check in write().
+template<typename T>
+void Buffer<T>::read(Buffer<T>& dst, int nElems) const
+{
+	ADLASSERT( nElems <= m_size );
+	SELECT_DEVICEDATA( m_device->m_type, copy(&dst, this, nElems) );
+}
+/*
+template<typename T>
+Buffer<T>& Buffer<T>::operator = ( const Buffer<T>& buffer )
+{
+//	ADLASSERT( buffer.m_size <= m_size );
+
+	SELECT_DEVICEDATA( m_device->m_type, copy(this, &buffer, min2( m_size, buffer.m_size) ) );
+
+	return *this;
+}
+*/
+
+//	Returns a buffer usable on a TYPE device that mirrors `in`.
+//	 - `in` already on a TYPE device: `in` itself is returned (const cast away).
+//	 - otherwise: a new Buffer of copySize elements (in->getSize() when
+//	   copySize is -1) is allocated on `device`; when COPY is true its
+//	   contents are filled from `in`, staging through a temporary host array
+//	   when neither side is a host buffer.
+//	The result must be passed to unmap(), which deletes the temporary.
+//
+//	`static` is only valid on the in-class declaration and `typename` needs a
+//	qualified name, so neither appears on this out-of-class definition.
+template<DeviceType TYPE, bool COPY, typename T>
+__inline
+Buffer<T>* BufferUtils::map(const Device* device, const Buffer<T>* in, int copySize)
+{
+	Buffer<T>* native;
+	ADLASSERT( device->m_type == TYPE );
+
+	if( in->getType() == TYPE )
+		native = (Buffer<T>*)in;
+	else
+	{
+		ADLASSERT( copySize <= in->getSize() );
+		copySize = (copySize==-1)? in->getSize() : copySize;
+
+		native = new Buffer<T>( device, copySize );
+		if( COPY )
+		{
+			if( in->getType() == TYPE_HOST )
+			{
+				//	host -> device: the source pointer is directly readable
+				native->write( in->m_ptr, copySize );
+			}
+			else if( native->getType() == TYPE_HOST )
+			{
+				//	device -> host: read straight into the new host storage
+				in->read( native->m_ptr, copySize );
+				DeviceUtils::waitForCompletion( in->m_device );
+			}
+			else
+			{
+				//	device -> device of another type: stage through host memory
+				T* tmp = new T[copySize];
+				in->read( tmp, copySize );
+				DeviceUtils::waitForCompletion( in->m_device );
+				native->write( tmp, copySize );
+				DeviceUtils::waitForCompletion( native->m_device );
+				delete [] tmp;
+			}
+		}
+	}
+	return native;
+}
+
+//	Counterpart of map(). When native is a temporary (native != orig) it is
+//	deleted; if COPY is true, copySize elements (orig->getSize() when -1) are
+//	first copied from native back into orig, staging through a temporary
+//	host array when neither side is a host buffer.
+//	Does nothing when native == orig.
+//
+//	`static` is only valid on the in-class declaration, so it is not repeated
+//	on this out-of-class definition.
+template<bool COPY, typename T>
+__inline
+void BufferUtils::unmap( Buffer<T>* native, const Buffer<T>* orig, int copySize )
+{
+	if( native == orig )
+		return;
+
+	if( COPY )
+	{
+		copySize = (copySize==-1)? orig->getSize() : copySize;
+		ADLASSERT( copySize <= orig->getSize() );
+		if( orig->getType() == TYPE_HOST )
+		{
+			//	device -> host: read straight into the original's storage
+			native->read( orig->m_ptr, copySize );
+			DeviceUtils::waitForCompletion( native->m_device );
+		}
+		else if( native->getType() == TYPE_HOST )
+		{
+			//	host -> device: orig is logically the output, so constness is dropped
+			Buffer<T>* dst = (Buffer<T>*)orig;
+			dst->write( native->m_ptr, copySize );
+			DeviceUtils::waitForCompletion( dst->m_device );
+		}
+		else
+		{
+			//	device -> device of another type: stage through host memory
+			T* tmp = new T[copySize];
+			native->read( tmp, copySize );
+			DeviceUtils::waitForCompletion( native->m_device );
+			Buffer<T>* dst = (Buffer<T>*)orig;
+			dst->write( tmp, copySize );
+			DeviceUtils::waitForCompletion( dst->m_device );
+			delete [] tmp;
+		}
+	}
+	delete native;
+}
+
+
+//	Unchecked mutable element access.
+//	m_ptr belongs to the dependent base Buffer<T>; this-> makes the name
+//	visible under standard two-phase lookup.
+template<typename T>
+T& HostBuffer<T>::operator[](int idx)
+{
+	return this->m_ptr[idx];
+}
+
+//	Unchecked read-only element access.
+//	m_ptr belongs to the dependent base Buffer<T>; this-> makes the name
+//	visible under standard two-phase lookup.
+template<typename T>
+const T& HostBuffer<T>::operator[](int idx) const
+{
+	return this->m_ptr[idx];
+}
+
+//	Copies all device.m_size elements of `device` into this buffer's storage,
+//	using the source buffer's device to perform the copy. Asserts that the
+//	source fits. Returns *this.
+//	m_ptr / m_size belong to the dependent base Buffer<T>; this-> makes the
+//	names visible under standard two-phase lookup.
+template<typename T>
+HostBuffer<T>& HostBuffer<T>::operator = ( const Buffer<T>& device )
+{
+	ADLASSERT( device.m_size <= this->m_size );
+
+	SELECT_DEVICEDATA1( device.m_device, copy( this->m_ptr, &device, device.m_size ) );
+
+	return *this;
+}
+
+#undef SELECT_DEVICEDATA
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlConfig.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlConfig.h
@@ -0,0 +1,27 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+//ADL_ENABLE_CL and ADL_ENABLE_DX11 can be set in the build system using C/C++ preprocessor defines
+//#define ADL_ENABLE_CL
+//#define ADL_ENABLE_DX11
+
+//#define ADL_CL_FORCE_UNCACHE_KERNEL
+#define ADL_CL_DUMP_MEMORY_LOG
+
+//load the kernels from string instead of loading them from file
+#define ADL_LOAD_KERNEL_FROM_STRING
+#define ADL_DUMP_DX11_ERROR
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlError.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlError.h
@@ -0,0 +1,80 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_ERROR_H
+#define ADL_ERROR_H
+
+#if defined(ADL_DUMP_DX11_ERROR)
+	#include <windows.h>
+#endif
+#ifdef _DEBUG
+	#include <assert.h>
+	#include <stdarg.h>
+	#include <stdio.h>
+#endif
+
+
+namespace adl
+{
+
+//	ADLASSERT(x): in _DEBUG builds breaks into the debugger when x is false.
+//	In other builds x is still evaluated but a false value has no effect, so
+//	code after a failed ADLASSERT keeps running.
+#ifdef _DEBUG
+	#define ADLASSERT(x) if(!(x)){__debugbreak(); }
+#else
+	#define ADLASSERT(x) if(x){}
+#endif
+
+//	COMPILE_TIME_ASSERT(x): in _DEBUG builds declares a local array of x
+//	elements inside its own scope; expands to nothing in other builds.
+#ifdef _DEBUG
+	#define COMPILE_TIME_ASSERT(x) {int compileTimeAssertFailed[x]; compileTimeAssertFailed[0];}
+#else
+	#define COMPILE_TIME_ASSERT(x)
+#endif
+
+//	printf-style debug output.
+//	 - _DEBUG with ADL_DUMP_DX11_ERROR: formats into a 10KB stack buffer and
+//	   sends it to OutputDebugString (converted to wide chars under UNICODE).
+//	 - _DEBUG without ADL_DUMP_DX11_ERROR: prints with vprintf.
+//	 - non-_DEBUG: does nothing.
+#ifdef _DEBUG
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+		va_list arg;
+		va_start(arg, fmt);
+#if defined(ADL_DUMP_DX11_ERROR)
+		const int size = 1024*10;
+		char buf[size];
+		vsprintf_s( buf, size, fmt, arg );
+#ifdef UNICODE
+		WCHAR wbuf[size];
+		//	first call only queries the required length, second call converts
+		int sizeWide = MultiByteToWideChar(0,0,buf,-1,wbuf,0);
+		MultiByteToWideChar(0,0,buf,-1,wbuf,sizeWide);
+
+//		swprintf_s( wbuf, 256, L"%s", buf );
+		OutputDebugString( wbuf );
+#else
+		OutputDebugString( buf );
+#endif
+#else
+		vprintf(fmt, arg);
+#endif
+		va_end(arg);
+	}
+#else
+	__inline
+	void debugPrintf(const char *fmt, ...)
+	{
+	}
+#endif
+
+};
+
+#endif
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlKernel.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlKernel.h
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#ifndef ADL_KERNEL_H
+#define ADL_KERNEL_H
+
+#include <map>
+#include <string>
+#include <fstream>
+
+namespace adl
+{
+
+//==========================
+//	Kernel
+//==========================
+//	Backend-agnostic kernel handle: the backend type plus an opaque pointer
+//	to the backend's own kernel object.
+struct Kernel
+{
+	DeviceType m_type;	//	backend this kernel was created for
+	void* m_kernel;		//	opaque backend kernel object
+};
+
+//==========================
+//	KernelManager
+//==========================
+//	Cache of built kernels, keyed by a string derived from the device
+//	context, file name, function name and build options (see query()).
+//	The manager owns the Kernel objects stored in m_map and releases them in
+//	its destructor.
+class KernelManager
+{
+	public:
+		typedef std::map<std::string, Kernel*> KMap;
+
+		__inline
+		~KernelManager();
+
+		//	Returns the cached kernel for the given key, building it first
+		//	when it is not in the cache yet. When src is non-null the kernel
+		//	is built from that source string instead of from fileName.
+		__inline
+//		static
+		Kernel* query(const Device* dd, const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL,
+			bool cacheKernel = true);
+
+	public:
+		KMap m_map;
+};
+
+//==========================
+//	Launcher
+//==========================
+//	Binds buffers and constants to a Kernel and launches it. All calls are
+//	forwarded to the launcher implementation of the device's backend.
+class Launcher
+{
+	public:
+		//	Type-erased (buffer, read-only flag) pair passed to setBuffers().
+		struct BufferInfo
+		{
+			//	Default state is an empty, writable slot rather than
+			//	indeterminate member values.
+			BufferInfo(): m_buffer(0), m_isReadOnly(false){}
+			template<typename T>
+			BufferInfo(Buffer<T>* buff, bool isReadOnly = false): m_buffer(buff), m_isReadOnly(isReadOnly){}
+
+			void* m_buffer;
+			bool m_isReadOnly;
+		};
+
+		//	Looks the kernel up through dd->getKernel( fileName, funcName, option ).
+		__inline
+		Launcher(const Device* dd, char* fileName, char* funcName, char* option = NULL);
+		//	Uses an already built kernel.
+		__inline
+		Launcher(const Device* dd, Kernel* kernel);
+		//	Binds n buffers described by buffInfo.
+		__inline
+		void setBuffers( BufferInfo* buffInfo, int n );
+		//	Binds the constant data `consts` through constBuff.
+		template<typename T>
+		__inline
+		void setConst( Buffer<T>& constBuff, const T& consts );
+		//	Launches numThreads threads in groups of localSize.
+		__inline
+		void launch1D( int numThreads, int localSize = 64 );
+		//	Launches numThreadsX x numThreadsY threads in groups of
+		//	localSizeX x localSizeY.
+		__inline
+		void launch2D( int numThreadsX, int numThreadsY, int localSizeX = 8, int localSizeY = 8 );
+
+	public:
+		enum
+		{
+			CONST_BUFFER_SIZE = 512,
+		};
+
+		const Device* m_deviceData;
+		Kernel* m_kernel;
+		int m_idx;
+		int m_idxRw;
+};
+
+//	Builds kernels for one backend (TYPE) from a source file or a source
+//	string. The member functions are implemented per backend.
+template<DeviceType TYPE>
+class KernelBuilder
+{
+	public:
+
+		//	Starts with no device, an empty path and no program, so no member
+		//	is left indeterminate before one of the setFrom*() calls.
+		__inline
+		KernelBuilder(): m_deviceData(0), m_ptr(0){ m_path[0] = 0; }
+		
+		//	Loads the program from fileName.
+		__inline
+		void setFromFile( const Device* deviceData, const char* fileName, const char* option = NULL, bool addExtension = false,
+			bool cacheKernel = true);
+
+		//	Builds the program from the source string src.
+		__inline
+		void setFromSrc( const Device* deviceData, const char* src, const char* option = NULL );
+
+		//	Like setFromSrc(), additionally given the file name the source
+		//	belongs to.
+		__inline
+		void setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option );
+
+
+		//	Creates the kernel for entry point funcName and stores it in kernelOut.
+		__inline
+		void createKernel( const char* funcName, Kernel& kernelOut );
+
+		__inline
+		~KernelBuilder();
+		//	todo. implement in kernel destructor?
+		__inline
+		static void deleteKernel( Kernel& kernel );
+
+	private:
+		enum
+		{
+			MAX_PATH_LENGTH = 260,
+		};
+		const Device* m_deviceData;
+#ifdef UNICODE
+		wchar_t m_path[MAX_PATH_LENGTH];
+#else
+		char m_path[MAX_PATH_LENGTH];
+#endif
+		void* m_ptr;
+};
+
+};
+
+#endif //ADL_KERNEL_H
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlKernel.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlKernel.inl
@@ -0,0 +1,223 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+#ifdef ADL_ENABLE_CL
+	#include <Adl/CL/AdlKernelUtilsCL.inl>
+#endif
+#ifdef ADL_ENABLE_DX11
+	#include <Adl/DX11/AdlKernelUtilsDX11.inl>
+#endif
+
+namespace adl
+{
+
+//==========================
+//	KernelManager
+//==========================
+//	Returns the kernel `funcName` from `fileName` for device dd, building it
+//	on first use and caching it in m_map afterwards.
+//	  dd          device the kernel is built for
+//	  fileName    source file name without extension (".cl" / ".hlsl" is appended)
+//	  funcName    kernel entry point
+//	  option      optional build options; part of the cache key
+//	  src         optional source string; when non-null it is used instead of the file
+//	  cacheKernel forwarded to the builder
+//	Returns 0 (and caches nothing) when dd's type has no compiled-in backend;
+//	debug builds assert in that case.
+Kernel* KernelManager::query(const Device* dd, const char* fileName, const char* funcName, const char* option, const char* src,
+	bool cacheKernel)
+{
+	printf("compiling kernel %s",funcName);
+	const int charSize = 1024*2;
+
+	//	file name including the backend specific extension
+	char fullFineName[charSize];
+	fullFineName[0] = 0;	//	stays a valid empty string for an unsupported type
+	switch( dd->m_type )
+	{
+#if defined(ADL_ENABLE_CL)
+	case TYPE_CL:
+		sprintf_s(fullFineName,charSize,"%s.cl", fileName);
+		break;
+#endif
+#if defined(ADL_ENABLE_DX11)
+	case TYPE_DX11:
+		sprintf_s(fullFineName,charSize,"%s.hlsl", fileName);
+		break;
+#endif
+	default:
+		ADLASSERT(0);
+		break;
+	};
+
+	//	Cache key: context + file + function (+ options). The context is
+	//	printed with %p so the complete pointer takes part in the key; an
+	//	(int) cast drops the upper half on 64-bit builds.
+	char mapName[charSize];
+	if( option )
+		sprintf_s(mapName, charSize, "%p%s%s%s", (const void*)dd->getContext(), fullFineName, funcName, option);
+	else
+		sprintf_s(mapName, charSize, "%p%s%s", (const void*)dd->getContext(), fullFineName, funcName);
+
+	std::string str(mapName);
+
+	KMap::iterator iter = m_map.find( str );
+	if( iter != m_map.end() )
+	{
+		printf(" ready\n");
+		return iter->second;
+	}
+
+	Kernel* kernelOut = new Kernel();
+	bool built = false;
+
+	switch( dd->m_type )
+	{
+#if defined(ADL_ENABLE_CL)
+	case TYPE_CL:
+		{
+			KernelBuilder<TYPE_CL> builder;
+			if( src )
+			{
+				if( cacheKernel )
+					builder.setFromSrcCached( dd, src, fileName, option );
+				else
+					builder.setFromSrc( dd, src, option );
+			}
+			else
+			{
+				builder.setFromFile( dd, fileName, option, true, cacheKernel );
+			}
+			builder.createKernel( funcName, *kernelOut );
+			built = true;
+		}
+		break;
+#endif
+#if defined(ADL_ENABLE_DX11)
+	case TYPE_DX11:
+		{
+			KernelBuilder<TYPE_DX11> builder;
+			if( src )
+				builder.setFromSrc( dd, src, option );
+			else
+				builder.setFromFile( dd, fileName, option, true, cacheKernel );
+			builder.createKernel( funcName, *kernelOut );
+			built = true;
+		}
+		break;
+#endif
+	default:
+		ADLASSERT(0);
+		break;
+	};
+
+	if( built )
+	{
+		m_map.insert( KMap::value_type(str,kernelOut) );
+	}
+	else
+	{
+		//	nothing was built: do not cache or hand out an empty Kernel
+		delete kernelOut;
+		kernelOut = 0;
+	}
+
+	printf(" ready\n");
+	return kernelOut;
+}
+
+//	Releases every cached kernel: the backend object through
+//	KernelBuilder::deleteKernel() and then the Kernel wrapper itself.
+//	The wrapper is deleted even when its type is not recognised (debug builds
+//	assert), so it cannot leak.
+KernelManager::~KernelManager()
+{
+	for(KMap::iterator iter = m_map.begin(); iter != m_map.end(); ++iter)
+	{
+		Kernel* k = iter->second;
+		switch( k->m_type )
+		{
+#if defined(ADL_ENABLE_CL)
+		case TYPE_CL:
+			KernelBuilder<TYPE_CL>::deleteKernel( *k );
+			break;
+#endif
+#if defined(ADL_ENABLE_DX11)
+		case TYPE_DX11:
+			KernelBuilder<TYPE_DX11>::deleteKernel( *k );
+			break;
+#endif
+		default:
+			ADLASSERT(0);
+			break;
+		};
+		delete k;
+	}
+}
+
+//==========================
+//	Launcher
+//==========================
+
+//	SELECT_LAUNCHER( type, func ): calls the static function `func` of the
+//	launcher class matching `type` (LauncherCL / LauncherDX11). One variant
+//	is defined per combination of ADL_ENABLE_DX11 / ADL_ENABLE_CL so only
+//	compiled-in backends are referenced; any other type hits ADLASSERT(0).
+#if defined(ADL_ENABLE_DX11)
+	#if defined(ADL_ENABLE_CL)
+	#define SELECT_LAUNCHER( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_CL: LauncherCL::func; break; \
+		case TYPE_DX11: LauncherDX11::func; break; \
+		default: ADLASSERT(0); break; \
+		};
+	#else
+	#define SELECT_LAUNCHER( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_DX11: LauncherDX11::func; break; \
+		default: ADLASSERT(0); break; \
+		};
+	#endif
+#else
+	#if defined(ADL_ENABLE_CL)
+	#define SELECT_LAUNCHER( type, func ) \
+		switch( type ) \
+		{ \
+		case TYPE_CL: LauncherCL::func; break; \
+		default: ADLASSERT(0); break; \
+		};
+	#else
+	#define SELECT_LAUNCHER( type, func ) \
+		switch( type ) \
+		{ \
+		default: ADLASSERT(0); break; \
+		};
+	#endif
+#endif
+
+//	Creates a launcher for the kernel `funcName` in `fileName`, obtained from
+//	the device; both argument counters start at zero.
+Launcher::Launcher(const Device *dd, char *fileName, char *funcName, char *option)
+	: m_deviceData( dd )
+	, m_kernel( dd->getKernel( fileName, funcName, option ) )
+	, m_idx( 0 )
+	, m_idxRw( 0 )
+{
+}
+
+//	Creates a launcher for an already built kernel; both argument counters
+//	start at zero.
+Launcher::Launcher(const Device* dd, Kernel* kernel)
+	: m_deviceData( dd )
+	, m_kernel( kernel )
+	, m_idx( 0 )
+	, m_idxRw( 0 )
+{
+}
+
+//	Forwards the n buffer descriptions in buffInfo to the setBuffers() of the
+//	launcher matching the device type.
+void Launcher::setBuffers( BufferInfo* buffInfo, int n )
+{
+	SELECT_LAUNCHER( m_deviceData->m_type, setBuffers( this, buffInfo, n ) );
+}
+
+//	Forwards the constant data `consts` and its buffer constBuff to the
+//	setConst() of the launcher matching the device type.
+template<typename T>
+void Launcher::setConst( Buffer<T>& constBuff, const T& consts )
+{
+	SELECT_LAUNCHER( m_deviceData->m_type, setConst( this, constBuff, consts ) );
+}
+
+//	1D launch, implemented as a 2D launch whose Y thread count and Y local
+//	size are both 1.
+void Launcher::launch1D( int numThreads, int localSize )
+{
+	SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreads, 1, localSize, 1 ) );
+}
+
+//	Forwards a 2D launch with the given thread counts and local sizes to the
+//	launch2D() of the launcher matching the device type.
+void Launcher::launch2D(  int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
+{
+	SELECT_LAUNCHER( m_deviceData->m_type, launch2D( this, numThreadsX, numThreadsY, localSizeX, localSizeY ) );
+}
+
+#undef SELECT_LAUNCHER
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlStopwatch.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlStopwatch.h
@@ -0,0 +1,81 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+#include <windows.h>
+
+namespace adl
+{
+
+//	Abstract interface implemented by the per-backend stopwatches.
+struct StopwatchBase
+{
+	__inline
+	StopwatchBase(): m_device(0), m_idx(0){}
+	//	Only records the device. init() is pure virtual and cannot be
+	//	dispatched to a derived class from a base-class constructor, so the
+	//	owner has to call init() after construction.
+	__inline
+	StopwatchBase( const Device* deviceData ): m_device(deviceData), m_idx(0){}
+	__inline
+	virtual ~StopwatchBase(){}
+
+	__inline
+	virtual void init( const Device* deviceData ) = 0;
+	__inline
+	virtual void start() = 0;
+	__inline
+	virtual void split() = 0;
+	__inline
+	virtual void stop() = 0;
+	//	Elapsed milliseconds of interval `index`.
+	__inline
+	virtual float getMs(int index=0) = 0;
+	//	Writes interval times in milliseconds into times[0..capacity).
+	__inline
+	virtual void getMs( float* times, int capacity ) = 0;
+	//	Number of intervals, computed as m_idx-1.
+	__inline
+	int getNIntervals() const{ return m_idx-1;}
+
+	enum
+	{
+		CAPACITY = 64,
+	};
+
+	const Device* m_device;
+	int m_idx;
+};
+
+//	Front end that owns a StopwatchBase implementation (m_impl) and forwards
+//	every call to it.
+//	NOTE(review): m_impl is a raw owning pointer deleted in the destructor and
+//	no copy operations are declared here, so copying a Stopwatch looks unsafe.
+struct Stopwatch
+{
+	//	With a device the implementation is created right away; without one
+	//	it is created lazily by start().
+	__inline
+	Stopwatch( const Device* deviceData = NULL ) { m_impl=0; if(deviceData) init(deviceData);}
+	__inline
+	~Stopwatch();
+
+	__inline
+	void init( const Device* deviceData );
+	//	Creates the implementation with a null device if init() has not run yet.
+	__inline
+	void start(){if(!m_impl) init(0); m_impl->start();}
+	//	The remaining calls dereference m_impl without a check, so they are
+	//	only valid after init() or start().
+	__inline
+	void split(){m_impl->split();}
+	__inline
+	void stop(){m_impl->stop();}
+	__inline
+	float getMs(){ return m_impl->getMs();}
+	__inline
+	void getMs( float* times, int capacity ){m_impl->getMs(times, capacity);}
+	__inline
+	int getNIntervals() const{return m_impl->getNIntervals();}
+
+	StopwatchBase* m_impl;
+};
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlStopwatch.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/AdlStopwatch.inl
@@ -0,0 +1,59 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+namespace adl
+{
+
+//	Creates the stopwatch implementation and initializes it with deviceData
+//	(which may be null). Must be called at most once per Stopwatch.
+//	Every backend currently uses the host timer (StopwatchCL / StopwatchDX11
+//	are not hooked up), so the implementation is created unconditionally. An
+//	unrecognised device type still asserts in debug builds, but m_impl is
+//	never left null for the init() call below.
+void Stopwatch::init( const Device* deviceData )
+{
+	ADLASSERT( m_impl == 0 );
+
+	if( deviceData )
+	{
+		switch( deviceData->m_type )
+		{
+#if defined(ADL_ENABLE_CL)
+		case TYPE_CL:	//	would be StopwatchCL
+#endif
+#if defined(ADL_ENABLE_DX11)
+		case TYPE_DX11:	//	would be StopwatchDX11
+#endif
+		case TYPE_HOST:
+			break;
+		default:
+			ADLASSERT(0);
+			break;
+		};
+	}
+
+	m_impl = new StopwatchHost;
+	m_impl->init( deviceData );
+}
+
+//	Destroys the implementation, if one was ever created.
+Stopwatch::~Stopwatch()
+{
+	//	deleting a null pointer is a no-op, so no explicit check is needed
+	delete m_impl;
+}
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/CL/AdlCL.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/CL/AdlCL.inl
@@ -0,0 +1,384 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+#pragma comment(lib,"OpenCL.lib")
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+#include <CL/cl_platform.h>
+
+namespace adl
+{
+
+//	OpenCL implementation of Device: owns one context, one command queue on
+//	one device, and the KernelManager caching kernels built for it.
+struct DeviceCL : public Device
+{
+	typedef DeviceUtils::Config Config;
+
+
+	//	All OpenCL handles start out null so the object holds no
+	//	indeterminate handles before initialize() has run.
+	__inline
+	DeviceCL() : Device( TYPE_CL ), m_context(0), m_commandQueue(0), m_deviceIdx(0), m_kernelManager(0){}
+	__inline
+	void* getContext() const { return m_context; }
+	//	Picks platform and device according to cfg and creates the context,
+	//	the command queue and the kernel manager.
+	__inline
+	void initialize(const Config& cfg);
+	//	Releases the command queue and context and deletes the kernel manager.
+	__inline
+	void release();
+
+	template<typename T>
+	__inline
+	void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
+
+	template<typename T>
+	__inline
+	void deallocate(Buffer<T>* buf);
+
+	//	buffer -> buffer copy
+	template<typename T>
+	__inline
+	void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems = 0,int dstOffsetNElems = 0);
+
+	//	buffer -> host memory copy
+	template<typename T>
+	__inline
+	void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);
+
+	//	host memory -> buffer copy
+	template<typename T>
+	__inline
+	void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);
+
+	__inline
+	void waitForCompletion() const;
+
+	__inline
+	void getDeviceName( char nameOut[128] ) const;
+
+	__inline
+	static
+	int getNDevices();
+
+	__inline
+	Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;
+
+
+	enum
+	{
+		MAX_NUM_DEVICES = 6,
+	};
+	
+	cl_context m_context;
+	cl_command_queue m_commandQueue;
+
+	cl_device_id m_deviceIdx;
+
+	KernelManager* m_kernelManager;
+};
+
+//===
+//===
+
+//	Selects an OpenCL platform and device according to cfg and creates the
+//	context, command queue and kernel manager.
+//	  cfg.m_type      DEVICE_GPU selects GPU devices, anything else CPU devices
+//	  cfg.m_vendor    preferred vendor; for GPUs AMD and NVIDIA fall back to
+//	                  each other when the preferred one has no device
+//	  cfg.m_deviceIdx device index within the platform, clamped to the valid range
+//	Failures assert in debug builds. When no platform or device is found,
+//	m_context and m_commandQueue stay null; the kernel manager is created
+//	in every case.
+void DeviceCL::initialize(const Config& cfg)
+{
+	enum
+	{
+		MAX_NUM_PLATFORMS = 5,
+	};
+	const cl_uint INVALID_IDX = (cl_uint)-1;
+
+	m_context = 0;
+	m_commandQueue = 0;
+	m_deviceIdx = 0;
+
+	cl_device_type deviceType = (cfg.m_type== Config::DEVICE_GPU)? CL_DEVICE_TYPE_GPU: CL_DEVICE_TYPE_CPU;
+	bool enableProfiling = false;
+#ifdef _DEBUG
+	enableProfiling = true;
+#endif
+	cl_int status;
+
+	cl_platform_id platform = 0;
+	{
+		cl_uint nPlatforms = 0;
+		status = clGetPlatformIDs(0, NULL, &nPlatforms);
+		ADLASSERT( status == CL_SUCCESS );
+		//	never ask for more platforms than the local array can hold
+		if( nPlatforms > (cl_uint)MAX_NUM_PLATFORMS ) nPlatforms = (cl_uint)MAX_NUM_PLATFORMS;
+
+		cl_platform_id pIdx[MAX_NUM_PLATFORMS];
+		status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
+		ADLASSERT( status == CL_SUCCESS );
+
+		cl_uint atiIdx = INVALID_IDX;
+		cl_uint intelIdx = INVALID_IDX;
+		cl_uint nvIdx = INVALID_IDX;
+
+		for(cl_uint i=0; i<nPlatforms; i++)
+		{
+			char buff[512];
+			status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, 512, buff, 0 );
+			ADLASSERT( status == CL_SUCCESS );
+
+			//skip the platform if there are no devices available
+			//(the count is pre-set because the query is not guaranteed to write it on failure)
+			cl_uint numDevice = 0;
+			status = clGetDeviceIDs( pIdx[i], deviceType, 0, NULL, &numDevice );
+			if (numDevice>0)
+			{
+				if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = i;
+				if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = i;
+				if( strcmp( buff, "Intel(R) Corporation" )==0 ) intelIdx = i;
+			}
+		}
+
+		cl_uint chosen = INVALID_IDX;
+		if( deviceType == CL_DEVICE_TYPE_GPU )
+		{
+			switch( cfg.m_vendor )
+			{
+			case DeviceUtils::Config::VD_AMD:
+				//	prefer AMD, fall back to NVIDIA
+				chosen = ( atiIdx != INVALID_IDX )? atiIdx : nvIdx;
+				break;
+			case DeviceUtils::Config::VD_NV:
+				//	prefer NVIDIA, fall back to AMD
+				chosen = ( nvIdx != INVALID_IDX )? nvIdx : atiIdx;
+				break;
+			default:
+				break;
+			};
+		}
+		else if( deviceType == CL_DEVICE_TYPE_CPU )
+		{
+			switch( cfg.m_vendor )
+			{
+			case DeviceUtils::Config::VD_AMD:
+				chosen = atiIdx;
+				break;
+			case DeviceUtils::Config::VD_INTEL:
+				chosen = intelIdx;
+				break;
+			default:
+				break;
+			};
+		}
+
+		//	unsupported vendor or no matching platform
+		ADLASSERT( chosen != INVALID_IDX );
+		if( chosen != INVALID_IDX )
+			platform = pIdx[chosen];
+	}
+
+	cl_uint numDevice = 0;
+	if( platform )
+		status = clGetDeviceIDs( platform, deviceType, 0, NULL, &numDevice );
+
+//	ADLASSERT( cfg.m_deviceIdx < (int)numDevice );
+
+	debugPrintf("CL: %d %s Devices ", numDevice, (deviceType==CL_DEVICE_TYPE_GPU)? "GPU":"CPU");
+
+	//	never ask for more devices than the local array can hold
+	if( numDevice > (cl_uint)MAX_NUM_DEVICES ) numDevice = (cl_uint)MAX_NUM_DEVICES;
+
+	ADLASSERT( numDevice > 0 );
+	if( numDevice > 0 )
+	{
+		cl_device_id deviceIds[ MAX_NUM_DEVICES ];
+
+		status = clGetDeviceIDs( platform, deviceType, numDevice, deviceIds, NULL );
+		ADLASSERT( status == CL_SUCCESS );
+
+		//	clamp the requested index into [0, numDevice-1]
+		int i = cfg.m_deviceIdx;
+		if( i > (int)numDevice-1 ) i = (int)numDevice-1;
+		if( i < 0 ) i = 0;
+
+		m_deviceIdx = deviceIds[i];
+		m_context = clCreateContext( NULL, 1, &m_deviceIdx, NULL, NULL, &status );
+		ADLASSERT( status == CL_SUCCESS );
+
+		char buff[512];
+		buff[0] = 0;
+		status = clGetDeviceInfo( m_deviceIdx, CL_DEVICE_NAME, sizeof(buff), buff, NULL );
+		ADLASSERT( status == CL_SUCCESS );
+
+		debugPrintf("[%s]\n", buff);
+
+		//	pass &status so the assert below checks this call, not the previous one
+		m_commandQueue = clCreateCommandQueue( m_context, m_deviceIdx, (enableProfiling)?CL_QUEUE_PROFILING_ENABLE:0, &status );
+
+		ADLASSERT( status == CL_SUCCESS );
+
+		if(0)
+		{
+			cl_bool image_support;
+			clGetDeviceInfo(m_deviceIdx, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
+			debugPrintf("	CL_DEVICE_IMAGE_SUPPORT : %s\n", image_support?"Yes":"No");
+		}
+	}
+
+	m_kernelManager = new KernelManager;
+}
+
+// Tears down the OpenCL objects held by this device: the command queue, the
+// context and finally the kernel manager.
+void DeviceCL::release()
+{
+	clReleaseCommandQueue( m_commandQueue );
+	clReleaseContext( m_context );
+
+	// deleting a null pointer is a no-op, so no explicit guard is required
+	delete m_kernelManager;
+}
+
+// Creates an OpenCL buffer object for nElems elements of T and binds it to buf.
+//
+//	buf    - descriptor to fill in; m_device, m_size and m_ptr are overwritten
+//	nElems - number of elements of type T
+//	type   - BUFFER_CONST allocates nothing (m_ptr stays 0);
+//	         BUFFER_ZERO_COPY adds CL_MEM_ALLOC_HOST_PTR to CL_MEM_READ_WRITE;
+//	         BUFFER_RAW is created CL_MEM_WRITE_ONLY;
+//	         every other type is created CL_MEM_READ_WRITE
+//
+// Asserts (ADLASSERT) when clCreateBuffer reports an error.
+template<typename T>
+void DeviceCL::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
+{
+	buf->m_device = this;
+	buf->m_size = nElems;
+	buf->m_ptr = 0;
+
+	if( type == BufferBase::BUFFER_CONST ) return;
+
+#if defined(ADL_CL_DUMP_MEMORY_LOG)
+	char deviceName[256];
+	getDeviceName( deviceName );
+   	printf( "adlCLMemoryLog	%s : %3.2fMB	Allocation: %3.2fKB ", deviceName, m_memoryUsage/1024.f/1024.f, sizeof(T)*nElems/1024.f );
+	fflush( stdout );
+#endif
+
+	// size_t rather than int: an int byte count overflows for allocations of 2GB and more
+	const size_t sz = sizeof(T)*(size_t)nElems;
+
+	cl_mem_flags flags = CL_MEM_READ_WRITE;
+	if( type == BufferBase::BUFFER_ZERO_COPY )
+		flags = CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR;
+	else if( type == BufferBase::BUFFER_RAW )
+		flags = CL_MEM_WRITE_ONLY;
+
+	cl_int status = 0;
+	buf->m_ptr = (T*)clCreateBuffer( m_context, flags, sz, 0, &status );
+
+	// Only count memory that was really obtained: deallocate() subtracts the size
+	// only when m_ptr is non-null, so a failed allocation must not be added here.
+	if( status == CL_SUCCESS )
+		m_memoryUsage += sz;
+
+#if defined(ADL_CL_DUMP_MEMORY_LOG)
+	printf( "%s\n", (status==CL_SUCCESS)? "Succeed": "Failed" );
+	fflush( stdout );
+#endif
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+// Releases the OpenCL memory object behind buf (when there is one) and resets
+// the descriptor so it no longer refers to this device.
+template<typename T>
+void DeviceCL::deallocate(Buffer<T>* buf)
+{
+	cl_mem memObj = (cl_mem)buf->m_ptr;
+	if( memObj )
+	{
+		// update the running byte count before m_size is cleared below
+		m_memoryUsage -= buf->m_size*sizeof(T);
+		clReleaseMemObject( memObj );
+	}
+
+	// the descriptor is detached whether or not anything had been allocated
+	buf->m_device = 0;
+	buf->m_size = 0;
+	buf->m_ptr = 0;
+}
+
+// Copies nElems elements from src to dst.
+//
+// When both buffers live on an OpenCL device the copy is enqueued on this
+// device's command queue and honours both element offsets. When one side is a
+// host buffer the call is forwarded to Buffer::write / Buffer::read with only
+// the element count.
+// NOTE(review): the host branches do not forward srcOffsetNElems/dstOffsetNElems - confirm
+// whether that is intended.
+template<typename T>
+void DeviceCL::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems,int srcOffsetNElems,int dstOffsetNElems )
+{
+	const bool dstOnCL = ( dst->m_device->m_type == TYPE_CL );
+	const bool srcOnCL = ( src->m_device->m_type == TYPE_CL );
+
+	if( dstOnCL && srcOnCL )
+	{
+		// device to device
+		const cl_int status = clEnqueueCopyBuffer( m_commandQueue, (cl_mem)src->m_ptr, (cl_mem)dst->m_ptr,
+			sizeof(T)*srcOffsetNElems, sizeof(T)*dstOffsetNElems, sizeof(T)*nElems, 0, 0, 0 );
+		ADLASSERT( status == CL_SUCCESS );
+		return;
+	}
+
+	if( src->m_device->m_type == TYPE_HOST )
+	{
+		// host to device
+		ADLASSERT( dst->getType() == TYPE_CL );
+		dst->write( src->m_ptr, nElems );
+		return;
+	}
+
+	if( dst->m_device->m_type == TYPE_HOST )
+	{
+		// device to host
+		ADLASSERT( src->getType() == TYPE_CL );
+		src->read( dst->m_ptr, nElems );
+		return;
+	}
+
+	// no other combination of device types is supported
+	ADLASSERT( 0 );
+}
+
+// Enqueues a read of nElems elements, starting srcOffsetNElems elements into
+// src, to the host memory at dst. The read is enqueued with the blocking flag
+// cleared.
+template<typename T>
+void DeviceCL::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems )
+{
+	const size_t byteOffset = sizeof(T)*srcOffsetNElems;
+	const size_t byteCount = sizeof(T)*nElems;
+
+	const cl_int status = clEnqueueReadBuffer( m_commandQueue, (cl_mem)src->m_ptr, CL_FALSE,
+		byteOffset, byteCount, dst, 0,0,0 );
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+// Enqueues a write of nElems elements from the host memory at src into dst,
+// starting dstOffsetNElems elements into the buffer. The write is enqueued with
+// the blocking flag cleared.
+template<typename T>
+void DeviceCL::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems )
+{
+	const int byteCount = sizeof(T)*nElems;
+	const size_t byteOffset = sizeof(T)*dstOffsetNElems;
+
+	const cl_int status = clEnqueueWriteBuffer( m_commandQueue, (cl_mem)dst->m_ptr, CL_FALSE,
+		byteOffset, byteCount, src, 0,0,0 );
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+// Blocks until every command enqueued on this device's command queue has
+// finished. Asserts when clFinish reports an error, like the other OpenCL
+// calls in this file.
+void DeviceCL::waitForCompletion() const
+{
+	cl_int status = clFinish( m_commandQueue );
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+// Returns the number of GPU devices on the preferred OpenCL platform.
+//
+// The NVIDIA platform is used when present, otherwise the AMD platform.
+// Returns 0 when no platform is available, when neither vendor's platform is
+// installed, or when the chosen platform exposes no GPU device.
+int DeviceCL::getNDevices()
+{
+	// capacity of the local platform table; surplus platforms are ignored
+	enum { MAX_PLATFORMS = 5 };
+
+	const cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+	cl_int status;
+
+	cl_uint nPlatforms = 0;
+	status = clGetPlatformIDs(0, NULL, &nPlatforms);
+	ADLASSERT( status == CL_SUCCESS );
+	if( status != CL_SUCCESS || nPlatforms == 0 ) return 0;
+
+	// never ask for more entries than pIdx can hold
+	if( nPlatforms > (cl_uint)MAX_PLATFORMS ) nPlatforms = MAX_PLATFORMS;
+
+	cl_platform_id pIdx[MAX_PLATFORMS];
+	status = clGetPlatformIDs(nPlatforms, pIdx, NULL);
+	ADLASSERT( status == CL_SUCCESS );
+	if( status != CL_SUCCESS ) return 0;
+
+	// signed indices so that -1 is an unambiguous "not found" marker
+	int nvIdx = -1;
+	int atiIdx = -1;
+	for(cl_uint i=0; i<nPlatforms; i++)
+	{
+		char buff[512];
+		status = clGetPlatformInfo( pIdx[i], CL_PLATFORM_VENDOR, sizeof(buff), buff, 0 );
+		ADLASSERT( status == CL_SUCCESS );
+		if( status != CL_SUCCESS ) continue;
+
+		if( strcmp( buff, "NVIDIA Corporation" )==0 ) nvIdx = (int)i;
+		if( strcmp( buff, "Advanced Micro Devices, Inc." )==0 ) atiIdx = (int)i;
+	}
+
+	// NVIDIA first, AMD as the fallback; without either there is nothing to count
+	const int chosenIdx = ( nvIdx != -1 )? nvIdx : atiIdx;
+	if( chosenIdx == -1 ) return 0;
+
+	cl_uint numDevice = 0;
+	status = clGetDeviceIDs( pIdx[chosenIdx], deviceType, 0, NULL, &numDevice );
+	// a platform without any GPU is a valid "zero devices" answer, not a failure
+	if( status == CL_DEVICE_NOT_FOUND ) return 0;
+	ADLASSERT( status == CL_SUCCESS );
+
+	return (int)numDevice;
+}
+
+// Writes the CL_DEVICE_NAME string of this device into nameOut, which must
+// provide room for 128 characters.
+void DeviceCL::getDeviceName( char nameOut[128] ) const
+{
+	const size_t capacity = sizeof(char)*128;
+	const cl_int status = clGetDeviceInfo( m_deviceIdx, CL_DEVICE_NAME, capacity, nameOut, NULL );
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+// Looks up a kernel through this device's kernel manager; all arguments are
+// forwarded unchanged to KernelManager::query together with this device.
+Kernel* DeviceCL::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel )const
+{
+	Kernel* kernel = m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
+	return kernel;
+}
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/CL/AdlKernelUtilsCL.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/CL/AdlKernelUtilsCL.inl
@@ -0,0 +1,541 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+
+
+namespace adl
+{
+
+// OpenCL view of a generic Kernel: gives typed access to the handle stored in
+// the inherited m_kernel member.
+struct KernelCL : public Kernel
+{
+	// Returns m_kernel reinterpreted as a cl_kernel reference, so callers can
+	// both read the handle and assign a new one through the result.
+	cl_kernel& getKernel() { return (cl_kernel&)m_kernel; }
+};
+
+// Returns a pointer into name just past the last occurrence of pattern, or
+// name itself when pattern does not occur (or is empty). The callers in this
+// file use it to drop everything up to the last "\\" or "/" of a path.
+//
+//	name    - NUL-terminated string to search; the result points into it
+//	pattern - NUL-terminated separator to look for
+static const char* strip(const char* name, const char* pattern)
+{
+	const size_t patlen = strlen(pattern);
+
+	// An empty pattern matches at every position without advancing, which would
+	// never terminate the search below.
+	if( patlen == 0 ) return name;
+
+	const char* tail = name;
+	const char* patloc;
+	while( (patloc = strstr(tail, pattern)) != NULL )
+	{
+		// resume the search right behind the match just found
+		tail = patloc + patlen;
+	}
+	return tail;
+}
+
+// Reports whether the cached binary is at least as new as its source file.
+//
+//	binaryFileName - path of the cached program binary
+//	srcFileName    - path of the kernel source the binary was built from
+//
+// Returns true when the binary exists and its last-write time is not older
+// than the source's. Returns false when the binary cannot be opened or
+// queried, or when the source's time stamp cannot be read. When the source
+// file itself cannot be opened, debug builds assert and release builds assume
+// the binary is up to date.
+static bool isFileUpToDate(const char* binaryFileName,const char* srcFileName)
+
+{
+	bool fileUpToDate = false;
+
+	bool binaryFileValid=false;
+	FILETIME modtimeBinary; 
+
+#ifdef UNICODE
+	HANDLE binaryFileHandle;
+	{
+		int nameLength = (int)strlen(binaryFileName)+1;
+		WCHAR* fName = new WCHAR[nameLength];
+		MultiByteToWideChar(CP_ACP,0,binaryFileName,-1, fName, nameLength);
+		binaryFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+		delete [] fName;
+	}
+#else
+	HANDLE binaryFileHandle = CreateFile(binaryFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+#endif
+	if (binaryFileHandle ==INVALID_HANDLE_VALUE)
+	{
+		DWORD errorCode;
+		errorCode = GetLastError();
+		switch (errorCode)
+		{
+		case ERROR_FILE_NOT_FOUND:
+			{
+				debugPrintf("\nCached file not found %s\n", binaryFileName);
+				break;
+			}
+		case ERROR_PATH_NOT_FOUND:
+			{
+				debugPrintf("\nCached file path not found %s\n", binaryFileName);
+				break;
+			}
+		default:
+			{
+				debugPrintf("\nFailed reading cached file with errorCode = %d\n", errorCode);
+			}
+		}
+	} else
+	{
+		if (GetFileTime(binaryFileHandle, NULL, NULL, &modtimeBinary)==0)
+		{
+			DWORD errorCode;
+			errorCode = GetLastError();
+			debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
+		} else
+		{
+			binaryFileValid = true;
+		}
+		CloseHandle(binaryFileHandle);
+	}
+
+	if (binaryFileValid)
+	{
+#ifdef UNICODE
+		int nameLength = (int)strlen(srcFileName)+1;
+		WCHAR* fName = new WCHAR[nameLength];
+		MultiByteToWideChar(CP_ACP,0,srcFileName,-1, fName, nameLength);
+		HANDLE srcFileHandle = CreateFile(fName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+		delete [] fName;
+#else
+		HANDLE srcFileHandle = CreateFile(srcFileName,GENERIC_READ,0,0,OPEN_EXISTING,FILE_ATTRIBUTE_NORMAL,0);
+#endif
+		if (srcFileHandle!=INVALID_HANDLE_VALUE)
+		{
+			FILETIME modtimeSrc; 
+			if (GetFileTime(srcFileHandle, NULL, NULL, &modtimeSrc)==0)
+			{
+				// modtimeSrc holds no valid value here, so it must not be compared;
+				// the binary is reported as stale and gets rebuilt
+				DWORD errorCode;
+				errorCode = GetLastError();
+				debugPrintf("\nGetFileTime errorCode = %d\n", errorCode);
+			}
+			else if (  ( modtimeSrc.dwHighDateTime < modtimeBinary.dwHighDateTime)
+				||(( modtimeSrc.dwHighDateTime == modtimeBinary.dwHighDateTime)&&(modtimeSrc.dwLowDateTime <= modtimeBinary.dwLowDateTime)))
+			{
+				// source last written before (or together with) the binary
+				fileUpToDate=true;
+			} else
+			{
+				debugPrintf("\nCached binary file found (%s), but out-of-date\n",binaryFileName);
+			}
+			CloseHandle(srcFileHandle);
+		} 
+		else
+		{
+#ifdef _DEBUG
+			DWORD errorCode;
+			errorCode = GetLastError();
+			switch (errorCode)
+			{
+			case ERROR_FILE_NOT_FOUND:
+				{
+					debugPrintf("\nSrc file not found %s\n", srcFileName);
+					break;
+				}
+			case ERROR_PATH_NOT_FOUND:
+				{
+					debugPrintf("\nSrc path not found %s\n", srcFileName);
+					break;
+				}
+			default:
+				{
+					debugPrintf("\nnSrc file reading errorCode = %d\n", errorCode);
+				}
+			}
+			ADLASSERT(0);
+#else
+			//if we cannot find the source, assume it is OK in release builds
+			fileUpToDate = true;
+#endif
+		}
+	}
+
+	return fileUpToDate;
+}
+
+// Builds the OpenCL program of this KernelBuilder from a kernel source file,
+// optionally going through an on-disk cache of the compiled binary.
+//
+//	deviceData   - device the program is built for (used as a DeviceCL)
+//	fileName     - path of the kernel source
+//	option       - build options handed to clBuildProgram
+//	addExtension - append ".cl" to fileName before opening it
+//	cacheKernel  - load/store the compiled binary in the cache directory
+//	               (forced off when ADL_CL_FORCE_UNCACHE_KERNEL is defined)
+//
+// The cache entry is named cache/<file>.<device name>.<driver version>.bin and
+// is only loaded when isFileUpToDate() accepts it. If no program results from
+// the cache, the source file is compiled and, with caching on, the binary is
+// written back.
+template<>
+void KernelBuilder<TYPE_CL>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
+	bool cacheKernel)
+{
+	m_deviceData = deviceData;
+
+	char fileNameWithExtension[256];
+
+	if( addExtension )
+		sprintf_s( fileNameWithExtension, "%s.cl", fileName );
+	else
+		sprintf_s( fileNameWithExtension, "%s", fileName );
+
+	// Minimal helper that slurps a whole file into a std::string.
+	class File
+	{
+		public:
+			// Reads the file into m_source; returns false when it cannot be opened.
+			__inline
+			bool open(const char* fileNameWithExtension)
+			{
+				size_t      size;
+				char*       str;
+
+				// Open file stream
+				std::fstream f(fileNameWithExtension, (std::fstream::in | std::fstream::binary));
+
+				// Check if we have opened file stream
+				if (f.is_open()) {
+					size_t  sizeFile;
+					// Find the stream size
+					f.seekg(0, std::fstream::end);
+					size = sizeFile = (size_t)f.tellg();
+					f.seekg(0, std::fstream::beg);
+
+					str = new char[size + 1];
+					if (!str) {
+						f.close();
+						return  false;
+					}
+
+					// Read file
+					f.read(str, sizeFile);
+					f.close();
+					str[size] = '\0';
+
+					m_source  = str;
+
+					delete[] str;
+
+					return true;
+				}
+
+				return false;
+			}
+			const std::string& getSource() const {return m_source;}
+
+		private:
+			std::string m_source;
+	};
+
+	// program aliases m_ptr, so assigning it also sets the builder's handle
+	cl_program& program = (cl_program&)m_ptr;
+	cl_int status = 0;
+
+	bool cacheBinary = cacheKernel;
+#if defined(ADL_CL_FORCE_UNCACHE_KERNEL)
+	cacheBinary = false;
+#endif
+
+	char binaryFileName[512];
+	{
+		char deviceName[256];
+		deviceData->getDeviceName(deviceName);
+		char driverVersion[256];
+		const DeviceCL* dd = (const DeviceCL*) deviceData;
+		clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
+		// keep only the file name: drop everything up to the last '\' and '/'
+		const char* strippedFileName = strip(fileName,"\\");
+		strippedFileName = strip(strippedFileName,"/");
+
+		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
+	}
+
+	// the time-stamp check touches the file system, so skip it when caching is off
+	const bool upToDate = cacheBinary && isFileUpToDate(binaryFileName,fileNameWithExtension);
+
+	if( upToDate )
+	{
+		FILE* file = fopen(binaryFileName, "rb");
+
+		if( file )
+		{
+			fseek( file, 0L, SEEK_END );
+			size_t binarySize = ftell( file );
+
+			rewind( file );
+			char* binary = new char[binarySize];
+			fread( binary, sizeof(char), binarySize, file );
+			fclose( file );
+
+			if (binarySize)
+			{
+				const DeviceCL* dd = (const DeviceCL*) deviceData;
+				program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
+				ADLASSERT( status == CL_SUCCESS );
+				status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
+
+				if( status != CL_SUCCESS )
+				{
+					// print the build log first, then assert, so the reason is visible
+					char *build_log;
+					size_t ret_val_size;
+					clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+					build_log = new char[ret_val_size+1];
+					clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+
+					build_log[ret_val_size] = '\0';
+
+					debugPrintf("%s\n", build_log);
+
+					delete [] build_log;
+					ADLASSERT(0);
+				}
+			}
+
+			// the staging copy of the file is no longer needed (setFromSrcCached
+			// releases it at the same point)
+			delete [] binary;
+		}
+	}
+	if( !m_ptr )
+	{
+		File kernelFile;
+		// call open() outside the assertion so it runs however ADLASSERT is defined
+		const bool opened = kernelFile.open( fileNameWithExtension );
+		ADLASSERT( opened );
+		(void)opened;
+		const char* source = kernelFile.getSource().c_str();
+		setFromSrc( m_deviceData, source, option );
+
+		if( cacheBinary )
+		{	//	write to binary
+			size_t binarySize;
+			status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
+			ADLASSERT( status == CL_SUCCESS );
+
+			char* binary = new char[binarySize];
+
+			status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
+			ADLASSERT( status == CL_SUCCESS );
+
+			{
+				FILE* file = fopen(binaryFileName, "wb");
+				if (file)
+				{
+					fwrite( binary, sizeof(char), binarySize, file );
+					fclose( file );
+				}
+			}
+
+			delete [] binary;
+		}
+	}
+}
+
+
+
+// Builds the OpenCL program of this KernelBuilder from an in-memory source
+// string, using the on-disk binary cache keyed by fileName.
+//
+//	deviceData - device the program is built for (used as a DeviceCL)
+//	src        - kernel source text, compiled when the cache cannot be used
+//	fileName   - name the source is known by; "<fileName>.cl" is the file whose
+//	             time stamp is compared against the cached binary
+//	option     - build options handed to clBuildProgram
+//
+// The cache entry is named cache/<file>.<device name>.<driver version>.bin.
+// After a build from source the binary is written back, but only when the
+// program is associated with exactly one device.
+template<>
+void KernelBuilder<TYPE_CL>::setFromSrcCached( const Device* deviceData, const char* src, const char* fileName, const char* option )
+{
+	m_deviceData = deviceData;
+
+	// program aliases m_ptr, so assigning it also sets the builder's handle
+	cl_program& program = (cl_program&)m_ptr;
+	cl_int status = 0;
+
+	char binaryFileName[512];
+	{
+		char deviceName[256];
+		deviceData->getDeviceName(deviceName);
+		char driverVersion[256];
+		const DeviceCL* dd = (const DeviceCL*) deviceData;
+		clGetDeviceInfo(dd->m_deviceIdx, CL_DRIVER_VERSION, 256, &driverVersion, NULL);
+
+		// keep only the file name: drop everything up to the last '\' and '/'
+		const char* strippedFileName = strip(fileName,"\\");
+		strippedFileName = strip(strippedFileName,"/");
+
+		sprintf_s(binaryFileName,"cache/%s.%s.%s.bin",strippedFileName, deviceName,driverVersion );
+	}
+
+	char fileNameWithExtension[256];
+	sprintf_s(fileNameWithExtension,"%s.cl",fileName);
+
+	// checked once; the answer cannot change between here and the load below
+	const bool fileUpToDate = isFileUpToDate(binaryFileName,fileNameWithExtension);
+
+	if( fileUpToDate )
+	{
+		FILE* file = fopen(binaryFileName, "rb");
+		if (file)
+		{
+			fseek( file, 0L, SEEK_END );
+			size_t binarySize = ftell( file );
+			rewind( file );
+			char* binary = new char[binarySize];
+			fread( binary, sizeof(char), binarySize, file );
+			fclose( file );
+
+			const DeviceCL* dd = (const DeviceCL*) deviceData;
+			program = clCreateProgramWithBinary( dd->m_context, 1, &dd->m_deviceIdx, &binarySize, (const unsigned char**)&binary, 0, &status );
+			ADLASSERT( status == CL_SUCCESS );
+			status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, 0, 0 );
+
+			if( status != CL_SUCCESS )
+			{
+				// print the build log first, then assert, so the reason is visible
+				char *build_log;
+				size_t ret_val_size;
+				clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+				build_log = new char[ret_val_size+1];
+				clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+
+				build_log[ret_val_size] = '\0';
+
+				debugPrintf("%s\n", build_log);
+
+				delete [] build_log;
+				ADLASSERT(0);
+			}
+			delete[] binary;
+		}
+	}
+
+	if( !m_ptr )
+	{
+		setFromSrc( deviceData, src, option );
+
+		//	write to binary
+		cl_uint numAssociatedDevices;
+		status = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &numAssociatedDevices, 0 );
+		ADLASSERT( status == CL_SUCCESS );
+		if (numAssociatedDevices==1)
+		{
+			size_t binarySize;
+			status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, 0 );
+			ADLASSERT( status == CL_SUCCESS );
+
+			char* binary = new char[binarySize];
+
+			status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(char*), &binary, 0 );
+			ADLASSERT( status == CL_SUCCESS );
+
+			{
+				FILE* file = fopen(binaryFileName, "wb");
+				if (file)
+				{
+					fwrite( binary, sizeof(char), binarySize, file );
+					fclose( file );
+				}
+			}
+
+			delete [] binary;
+		}
+	}
+}
+
+
+// Compiles src into the OpenCL program of this KernelBuilder.
+//
+//	deviceData - device the program is built for; must be of TYPE_CL
+//	src        - NUL-terminated kernel source text
+//	option     - build options handed to clBuildProgram
+//
+// On a build failure the build log is printed (debugPrintf and printf) and the
+// function asserts.
+template<>
+void KernelBuilder<TYPE_CL>::setFromSrc( const Device* deviceData, const char* src, const char* option )
+{
+	ADLASSERT( deviceData->m_type == TYPE_CL );
+	m_deviceData = deviceData;
+	const DeviceCL* dd = (const DeviceCL*) deviceData;
+
+	// program aliases m_ptr, so assigning it also sets the builder's handle
+	cl_program& program = (cl_program&)m_ptr;
+	cl_int status = 0;
+	size_t srcSize[] = {strlen( src )};
+	program = clCreateProgramWithSource( dd->m_context, 1, &src, srcSize, &status );
+	ADLASSERT( status == CL_SUCCESS );
+	status = clBuildProgram( program, 1, &dd->m_deviceIdx, option, NULL, NULL );
+	if( status != CL_SUCCESS )
+	{
+		char *build_log;
+		size_t ret_val_size;
+		clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+		build_log = new char[ret_val_size+1];
+		clGetProgramBuildInfo(program, dd->m_deviceIdx, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+
+		build_log[ret_val_size] = '\0';
+
+		debugPrintf("%s\n", build_log);
+		printf("%s\n", build_log);
+
+		// allocated with new[], so it has to be released with delete[]
+		delete [] build_log;
+		ADLASSERT(0);
+	}
+}
+
+// Releases the OpenCL program handle stored in m_ptr.
+template<>
+KernelBuilder<TYPE_CL>::~KernelBuilder()
+{
+	clReleaseProgram( (cl_program)m_ptr );
+}
+
+// Creates the kernel named funcName from this builder's program and stores
+// its handle in kernelOut, which is then tagged as TYPE_CL.
+template<>
+void KernelBuilder<TYPE_CL>::createKernel( const char* funcName, Kernel& kernelOut )
+{
+	// kernelOut is accessed through its OpenCL view to reach the typed handle
+	KernelCL& target = *(KernelCL*)&kernelOut;
+
+	cl_int status = 0;
+	target.getKernel() = clCreateKernel( (cl_program)m_ptr, funcName, &status );
+	ADLASSERT( status == CL_SUCCESS );
+
+	kernelOut.m_type = TYPE_CL;
+}
+
+// Releases the OpenCL kernel handle held by kernel.
+template<>
+void KernelBuilder<TYPE_CL>::deleteKernel( Kernel& kernel )
+{
+	cl_kernel handle = ((KernelCL*)&kernel)->getKernel();
+	clReleaseKernel( handle );
+}
+
+
+
+// Static helpers that carry out Launcher operations on the OpenCL back end.
+// Each one works on the kernel referenced by the Launcher passed in.
+class LauncherCL
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// Binds the n buffers described by buffInfo as the next n kernel arguments.
+		__inline
+		static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );
+		// Passes consts by value as the next kernel argument.
+		template<typename T>
+		__inline
+		static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );
+		// Enqueues the kernel over a 2D range covering at least
+		// numThreadsX x numThreadsY work items with the given work-group size.
+		__inline
+		static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
+};
+
+// Binds n buffers as consecutive kernel arguments. Each buffer's device
+// pointer (m_ptr) is passed as a cl_mem, and the launcher's argument index
+// advances by one per buffer.
+void LauncherCL::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
+{
+	KernelCL* kernelCL = (KernelCL*)launcher->m_kernel;
+
+	int bufNo = 0;
+	while( bufNo < n )
+	{
+		// only m_ptr is read, so the element type used for the cast does not matter
+		Buffer<int>* argBuffer = (Buffer<int>*)buffInfo[bufNo].m_buffer;
+		const cl_int status = clSetKernelArg( kernelCL->getKernel(), launcher->m_idx++, sizeof(cl_mem), &argBuffer->m_ptr );
+		ADLASSERT( status == CL_SUCCESS );
+		++bufNo;
+	}
+}
+
+// Passes consts by value as the next kernel argument and advances the
+// launcher's argument index. constBuff is not used by the OpenCL back end.
+template<typename T>
+void LauncherCL::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
+{
+	cl_kernel& kernel = ((KernelCL*)launcher->m_kernel)->getKernel();
+	const cl_int status = clSetKernelArg( kernel, launcher->m_idx++, sizeof(T), &consts );
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+// Enqueues the launcher's kernel over a 2D range. In each dimension the global
+// size is the thread count rounded up to a whole number of work groups, with a
+// minimum of one work group.
+void LauncherCL::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
+{
+	const size_t localX = localSizeX;
+	const size_t localY = localSizeY;
+
+	// ceil( numThreads / localSize ), but never fewer than one group
+	size_t groupsX = (size_t)numThreadsX / localX;
+	if( (size_t)numThreadsX % localX ) groupsX++;
+	if( groupsX < 1 ) groupsX = 1;
+
+	size_t groupsY = (size_t)numThreadsY / localY;
+	if( (size_t)numThreadsY % localY ) groupsY++;
+	if( groupsY < 1 ) groupsY = 1;
+
+	// third entries are unused by the 2D enqueue below
+	size_t gRange[3] = { groupsX*localX, groupsY*localY, 1 };
+	size_t lRange[3] = { localX, localY, 1 };
+
+	KernelCL* kernelCL = (KernelCL*)launcher->m_kernel;
+	const DeviceCL* device = (const DeviceCL*)launcher->m_deviceData;
+
+	const cl_int status = clEnqueueNDRangeKernel( device->m_commandQueue,
+		kernelCL->getKernel(), 2, NULL, gRange, lRange, 0,0,0 );
+	ADLASSERT( status == CL_SUCCESS );
+}
+
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/DX11/AdlDX11.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/DX11/AdlDX11.inl
@@ -0,0 +1,512 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#include <windows.h>
+#include <d3d11.h>
+#include <d3dx11.h>
+#include <d3dcompiler.h>
+#include <DXGI.h>
+#pragma comment(lib,"d3dx11.lib")
+#pragma comment(lib,"d3d11.lib")
+#pragma comment(lib,"DXGI.lib")
+
+namespace adl
+{
+
+#define u32 unsigned int
+
+// Direct3D 11 implementation of Device. Holds the D3D device and its immediate
+// context and offers buffer allocation, copies and kernel lookup on top of them.
+struct DeviceDX11 : public Device
+{
+	typedef DeviceUtils::Config Config;
+
+
+	// All handles start out null so that release() is safe on a device that was
+	// never (or not successfully) initialized.
+	__inline
+	DeviceDX11() : Device( TYPE_DX11 ), m_context(0), m_device(0), m_swapChain(0), m_kernelManager(0){}
+	// Returns the D3D11 device context as an untyped pointer.
+	__inline
+	void* getContext() const { return m_context; }
+	// Creates the D3D11 device/context for the adapter selected by cfg.
+	__inline
+	void initialize(const Config& cfg);
+	// Releases the context, the device and the kernel manager.
+	__inline
+	void release();
+
+	// Creates the D3D11 buffer (and views, depending on type) behind buf.
+	template<typename T>
+	__inline
+	void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
+
+	// Releases the D3D11 buffer and views behind buf.
+	template<typename T>
+	__inline
+	void deallocate(Buffer<T>* buf);
+
+	// Copies nElems elements between two buffers.
+	template<typename T>
+	__inline
+	void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);
+
+	// Copies nElems elements from a device buffer to host memory.
+	template<typename T>
+	__inline
+	void copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems = 0);
+
+	// Copies nElems elements from host memory to a device buffer.
+	template<typename T>
+	__inline
+	void copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems = 0);
+
+	__inline
+	void waitForCompletion() const;
+
+	__inline
+	void getDeviceName( char nameOut[128] ) const;
+
+	__inline
+	static
+	int getNDevices();
+
+	__inline
+	Kernel* getKernel(const char* fileName, const char* funcName, const char* option = NULL, const char* src = NULL, bool cacheKernel = true )const;
+
+
+	ID3D11DeviceContext* m_context;
+	ID3D11Device* m_device;
+	IDXGISwapChain* m_swapChain;
+
+	KernelManager* m_kernelManager;
+};
+
+// Direct3D 11 view of a Buffer<T>: typed access to the buffer object and to its
+// unordered-access and shader-resource views, which Buffer<T> stores untyped.
+//
+// Members inherited from the dependent base Buffer<T> are reached through
+// this-> so that the names are also found under standard two-phase lookup.
+template<typename T>
+struct BufferDX11 : public Buffer<T>
+{
+	// typed handles
+	ID3D11Buffer* getBuffer() { return (ID3D11Buffer*)this->m_ptr; }
+	ID3D11UnorderedAccessView* getUAV() { return (ID3D11UnorderedAccessView*)this->m_uav; }
+	ID3D11ShaderResourceView* getSRV() { return (ID3D11ShaderResourceView*)this->m_srv; }
+
+	// addresses of the stored handles, for D3D calls that write a new object into them
+	ID3D11Buffer** getBufferPtr() { return (ID3D11Buffer**)&this->m_ptr; }
+	ID3D11UnorderedAccessView** getUAVPtr() { return (ID3D11UnorderedAccessView**)&this->m_uav; }
+	ID3D11ShaderResourceView** getSRVPtr() { return (ID3D11ShaderResourceView**)&this->m_srv; }
+};
+
+#define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
+
+
+// Creates the Direct3D 11 device and immediate context on the adapter selected
+// by cfg.m_deviceIdx, verifies compute-shader support and creates the kernel
+// manager.
+//
+// Adapter selection: adapters are enumerated in DXGI order until the requested
+// index is reached; when the index is past the last adapter, the last adapter
+// that was enumerated is used.
+//
+// Asserts when the DXGI factory or the device cannot be created, or when the
+// device lacks compute shaders with raw/structured buffers on shader model 4.x.
+void DeviceDX11::initialize(const Config& cfg)
+{
+	DeviceDX11* deviceData = this;
+
+	HRESULT hr = S_OK;
+	UINT createDeviceFlg = 0;
+#ifdef _DEBUG
+	createDeviceFlg |= D3D11_CREATE_DEVICE_DEBUG;
+#endif
+	D3D_FEATURE_LEVEL fl[] = {
+		D3D_FEATURE_LEVEL_11_0,
+		D3D_FEATURE_LEVEL_10_1,
+		D3D_FEATURE_LEVEL_10_0
+	};
+
+	//	http://msdn.microsoft.com/en-us/library/ff476082(v=VS.85).aspx
+	//	When pAdapter is non-NULL the driver type must be D3D_DRIVER_TYPE_UNKNOWN;
+	//	D3D_DRIVER_TYPE_HARDWARE makes D3D11CreateDevice return E_INVALIDARG.
+	const D3D_DRIVER_TYPE type = D3D_DRIVER_TYPE_UNKNOWN;
+
+	IDXGIAdapter* adapter = NULL;
+	{//	get adapter of the index
+		IDXGIFactory* factory = NULL;
+		const u32 targetAdapterIdx = (u32)cfg.m_deviceIdx;
+		hr = CreateDXGIFactory( __uuidof(IDXGIFactory), (void**)&factory );
+		ADLASSERT( hr == S_OK );
+
+		if( factory )
+		{
+			IDXGIAdapter* candidate = NULL;
+			for( u32 i = 0; factory->EnumAdapters( i, &candidate ) != DXGI_ERROR_NOT_FOUND; i++ )
+			{
+				// every enumerated adapter carries a reference; drop the one we
+				// are moving past so that only the final choice stays alive
+				SAFE_RELEASE( adapter );
+				adapter = candidate;
+				candidate = NULL;
+				if( i == targetAdapterIdx ) break;
+			}
+			factory->Release();
+		}
+	}
+
+	// Create a hardware Direct3D 11 device
+	hr = D3D11CreateDevice( adapter, 
+		type, 
+		NULL, createDeviceFlg,
+		fl, _countof(fl), D3D11_SDK_VERSION, &deviceData->m_device, NULL, &deviceData->m_context );
+
+	// our enumeration reference is no longer needed once the device exists
+	SAFE_RELEASE( adapter );
+
+	ADLASSERT( hr == S_OK );
+
+	// Check if the hardware device supports Compute Shader 4.0
+	D3D11_FEATURE_DATA_D3D10_X_HARDWARE_OPTIONS hwopts;
+	deviceData->m_device->CheckFeatureSupport(D3D11_FEATURE_D3D10_X_HARDWARE_OPTIONS, &hwopts, sizeof(hwopts));
+
+	if( !hwopts.ComputeShaders_Plus_RawAndStructuredBuffers_Via_Shader_4_x )
+	{
+		SAFE_RELEASE( deviceData->m_context );
+		SAFE_RELEASE( deviceData->m_device );
+
+		debugPrintf("DX11 GPU is not present\n");
+		ADLASSERT( 0 );
+	}
+
+	m_kernelManager = new KernelManager;
+}
+
+// Releases the D3D11 context and device (each only when non-null, and nulled
+// afterwards by SAFE_RELEASE) and destroys the kernel manager.
+void DeviceDX11::release()
+{
+	SAFE_RELEASE( m_context );
+	SAFE_RELEASE( m_device );
+
+	// deleting a null pointer is a no-op, so no explicit guard is required
+	delete m_kernelManager;
+}
+
+// Creates the D3D11 buffer behind buf and, depending on type, the views needed
+// to bind it to a compute shader.
+//
+//	buf    - descriptor to fill in; m_device and m_size are set, the buffer and
+//	         view handles are written through the BufferDX11 accessors
+//	nElems - number of elements of type T (must be 1 for BUFFER_CONST)
+//	type   - selects usage, bind flags and which views get created:
+//	         BUFFER_CONST    constant buffer, size rounded up to 16 bytes, no views
+//	         BUFFER_STAGING  CPU-readable staging buffer, no views
+//	         BUFFER_INDEX / BUFFER_VERTEX  index / vertex buffer, no views
+//	         BUFFER, BUFFER_RAW, BUFFER_W_COUNTER  UAV and SRV
+//	         BUFFER_APPEND   append UAV only
+//	         BUFFER_ZERO_COPY is not supported and asserts
+//
+// Every D3D call is checked with ADLASSERT( hr == S_OK ).
+template<typename T>
+void DeviceDX11::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
+{
+	ADLASSERT( type != BufferBase::BUFFER_ZERO_COPY );
+
+	DeviceDX11* deviceData = this;
+	buf->m_device = deviceData;
+	buf->m_size = nElems;
+	BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;
+
+//	if( type & BufferBase::BUFFER )
+	{
+		HRESULT hr = S_OK;
+
+		// Constant buffers are handled separately and return early.
+		if( type == BufferBase::BUFFER_CONST )
+		{
+			ADLASSERT( nElems == 1 );
+			D3D11_BUFFER_DESC constant_buffer_desc;
+			ZeroMemory( &constant_buffer_desc, sizeof(constant_buffer_desc) );
+//			constant_buffer_desc.ByteWidth = NEXTMULTIPLEOF( sizeof(T), 16 );
+			// sizeof(T) rounded up to the next multiple of 16 bytes
+			constant_buffer_desc.ByteWidth = (((sizeof(T))/(16) + (((sizeof(T))%(16)==0)?0:1))*(16));
+//			constant_buffer_desc.Usage = D3D11_USAGE_DYNAMIC;
+//			constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+//			constant_buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;
+			constant_buffer_desc.Usage = D3D11_USAGE_DEFAULT;
+			constant_buffer_desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
+			constant_buffer_desc.CPUAccessFlags = 0;
+
+			hr = deviceData->m_device->CreateBuffer( &constant_buffer_desc, NULL, dBuf->getBufferPtr() );
+			ADLASSERT( hr == S_OK );
+			return;
+		}
+
+		// Common description for all remaining buffer types.
+		D3D11_BUFFER_DESC buffer_desc;
+		ZeroMemory(&buffer_desc, sizeof(buffer_desc));
+		buffer_desc.ByteWidth = nElems * sizeof(T);
+
+		// Everything except raw buffers records the element stride.
+		if( type != BufferBase::BUFFER_RAW )
+		{
+			buffer_desc.StructureByteStride = sizeof(T);
+//		    buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+		}
+
+		// Usage / bind / misc flags per buffer type.
+		if( type == BufferBase::BUFFER_STAGING )
+		{
+			buffer_desc.Usage = D3D11_USAGE_STAGING;
+		    buffer_desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
+		}
+		else if( type == BufferBase::BUFFER_INDEX )
+		{
+			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
+			buffer_desc.BindFlags = D3D11_BIND_INDEX_BUFFER;
+		}
+		else if( type == BufferBase::BUFFER_VERTEX )
+		{
+			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
+			buffer_desc.BindFlags = D3D11_BIND_VERTEX_BUFFER;
+		}
+		else
+		{
+			// compute buffers: structured by default, bindable as UAV and SRV
+			buffer_desc.Usage = D3D11_USAGE_DEFAULT;
+			
+			buffer_desc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;
+			buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
+
+//	check this
+			// raw buffers replace the structured flag with the raw-view flags
+			if(type == BufferBase::BUFFER_RAW)
+			{
+//				buffer_desc.BindFlags |= D3D11_BIND_INDEX_BUFFER | D3D11_BIND_VERTEX_BUFFER;
+				buffer_desc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS | D3D11_RESOURCE_MISC_DRAWINDIRECT_ARGS; // need this to be used for DispatchIndirect
+			}
+		}
+		hr = deviceData->m_device->CreateBuffer(&buffer_desc, NULL, dBuf->getBufferPtr());
+
+		ADLASSERT( hr == S_OK );
+
+		if( type == BufferBase::BUFFER_INDEX ) return;
+
+		// View creation. Types not listed in either branch below get no views.
+		if( type == BufferBase::BUFFER || 
+			type == BufferBase::BUFFER_RAW || 
+			type == BufferBase::BUFFER_W_COUNTER )
+		{
+			// Create UAVs for all CS buffers
+			D3D11_UNORDERED_ACCESS_VIEW_DESC uavbuffer_desc;
+			ZeroMemory(&uavbuffer_desc, sizeof(uavbuffer_desc));
+			uavbuffer_desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+
+			if( type == BufferBase::BUFFER_RAW )
+			{
+				// raw views address the buffer as 32-bit words
+				uavbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+				uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_RAW;
+				uavbuffer_desc.Buffer.NumElements = buffer_desc.ByteWidth / 4; 
+			}
+			else
+			{
+				uavbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
+				uavbuffer_desc.Buffer.NumElements = nElems;
+			}
+
+			if( type == BufferBase::BUFFER_W_COUNTER )
+			{
+				uavbuffer_desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_COUNTER;
+			}
+
+			hr = deviceData->m_device->CreateUnorderedAccessView(dBuf->getBuffer(), &uavbuffer_desc, dBuf->getUAVPtr());
+			ADLASSERT( hr == S_OK );
+
+			// Create SRVs for all CS buffers
+			D3D11_SHADER_RESOURCE_VIEW_DESC srvbuffer_desc;
+			ZeroMemory(&srvbuffer_desc, sizeof(srvbuffer_desc));
+			if( type == BufferBase::BUFFER_RAW )
+			{
+				ADLASSERT( sizeof(T) <= 16 );
+				srvbuffer_desc.Format = DXGI_FORMAT_R32_UINT;
+				srvbuffer_desc.Buffer.ElementWidth = nElems;
+//			if ( buffer_desc.MiscFlags & D3D11_RESOURCE_MISC_BUFFER_ALLOW_RAW_VIEWS )
+//			{
+//				srvbuffer_desc.Format = DXGI_FORMAT_R32_TYPELESS;
+//				srvbuffer_desc.BufferEx.Flags = D3D11_BUFFEREX_SRV_FLAG_RAW;
+//				srvbuffer_desc.BufferEx.NumElements = buffer_desc.ByteWidth / 4;
+			}
+			else
+			{
+				srvbuffer_desc.Format = DXGI_FORMAT_UNKNOWN;
+				srvbuffer_desc.Buffer.ElementWidth = nElems;
+			}
+			srvbuffer_desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFER;
+
+			hr = deviceData->m_device->CreateShaderResourceView(dBuf->getBuffer(), &srvbuffer_desc, dBuf->getSRVPtr());
+			ADLASSERT( hr == S_OK );
+		}
+		else if( type == BufferBase::BUFFER_APPEND )
+		{
+			// append buffers only get an unordered-access view
+			D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
+			ZeroMemory( &desc, sizeof(desc) );
+			desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
+			desc.Buffer.FirstElement = 0;
+
+			desc.Buffer.Flags = D3D11_BUFFER_UAV_FLAG_APPEND;
+
+			desc.Format = DXGI_FORMAT_UNKNOWN;      // Format must be DXGI_FORMAT_UNKNOWN when creating a view of a structured buffer
+			desc.Buffer.NumElements = buffer_desc.ByteWidth / buffer_desc.StructureByteStride; 
+
+			hr = deviceData->m_device->CreateUnorderedAccessView( dBuf->getBuffer(), &desc, dBuf->getUAVPtr() );
+			ADLASSERT( hr == S_OK );
+		}
+	}
+//	else
+//	{
+//		ADLASSERT(0);
+//	}
+}
+
+// Releases the D3D11 buffer and its unordered-access and shader-resource views
+// (each only when present), nulls the corresponding handles and detaches buf
+// from this device.
+template<typename T>
+void DeviceDX11::deallocate(Buffer<T>* buf)
+{
+	BufferDX11<T>* dBuf = (BufferDX11<T>*)buf;
+
+	if( ID3D11Buffer* resource = dBuf->getBuffer() )
+	{
+		resource->Release();
+		dBuf->m_ptr = NULL;
+	}
+
+	if( ID3D11UnorderedAccessView* uav = dBuf->getUAV() )
+	{
+		uav->Release();
+		dBuf->m_uav = NULL;
+	}
+
+	if( ID3D11ShaderResourceView* srv = dBuf->getSRV() )
+	{
+		srv->Release();
+		dBuf->m_srv = NULL;
+	}
+
+	buf->m_device = 0;
+}
+
+// Copies the first nElems elements of src into dst.
+//   - both buffers on a DX11 device : GPU-side CopySubresourceRegion
+//   - src on the host, dst on DX11   : forwarded to dst->write()
+//   - dst on the host, src on DX11   : forwarded to src->read()
+// Any other combination asserts.
+template<typename T>
+void DeviceDX11::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
+{
+	// The GPU path reinterprets BOTH buffers as BufferDX11, so it is only valid when
+	// both really live on a DX11 device. (The previous '||' sent mixed host/DX11 pairs
+	// down this path and made the host branches below unreachable without a failed assert.)
+	if( dst->m_device->m_type == TYPE_DX11 && src->m_device->m_type == TYPE_DX11 )
+	{
+		BufferDX11<T>* dDst = (BufferDX11<T>*)dst;
+		BufferDX11<T>* dSrc = (BufferDX11<T>*)src;
+
+		// Byte range [0, nElems*sizeof(T)) of the source buffer.
+		D3D11_BOX srcRegion;
+		srcRegion.left = 0*sizeof(T);
+		srcRegion.front = 0;
+		srcRegion.top = 0;
+		srcRegion.bottom = 1;
+		srcRegion.back = 1;
+		srcRegion.right = (0+nElems)*sizeof(T);
+
+		m_context->CopySubresourceRegion(
+				dDst->getBuffer(),
+				0, 0, 0, 0,
+				dSrc->getBuffer(),
+				0,
+				&srcRegion );
+	}
+	else if( src->m_device->m_type == TYPE_HOST )
+	{
+		ADLASSERT( dst->getType() == TYPE_DX11 );
+		dst->write( src->m_ptr, nElems );
+	}
+	else if( dst->m_device->m_type == TYPE_HOST )
+	{
+		ADLASSERT( src->getType() == TYPE_DX11 );
+		src->read( dst->m_ptr, nElems );
+	}
+	else
+	{
+		ADLASSERT( 0 );
+	}
+}
+
+// Reads nElems elements, starting at element srcOffsetNElems of the DX11 buffer src,
+// into the host array dst. The data is first copied into a temporary staging buffer,
+// which is then mapped for reading.
+template<typename T>
+void DeviceDX11::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
+{
+	// Nothing to read; also avoids requesting a zero-sized staging buffer.
+	if( nElems <= 0 ) return;
+
+	BufferDX11<T>* dSrc = (BufferDX11<T>*)src;
+	Buffer<T> sBuf( this, nElems, BufferBase::BUFFER_STAGING );
+	BufferDX11<T>* dStagingBuf = (BufferDX11<T>*)&sBuf;
+
+	ID3D11Buffer* stagingBuffer = dStagingBuf->getBuffer();
+
+	// Byte range of the source buffer to transfer.
+	D3D11_BOX srcRegion;
+	srcRegion.left = srcOffsetNElems*sizeof(T);
+	srcRegion.front = 0;
+	srcRegion.top = 0;
+	srcRegion.bottom = 1;
+	srcRegion.back = 1;
+	srcRegion.right = (srcOffsetNElems+nElems)*sizeof(T);
+
+	m_context->CopySubresourceRegion(
+			stagingBuffer,
+			0, 0, 0, 0,
+			dSrc->getBuffer(),
+			0,
+			&srcRegion );
+
+	// Only touch the mapped pointer when Map actually succeeded.
+	D3D11_MAPPED_SUBRESOURCE mapped = {0};
+	HRESULT hr = m_context->Map( stagingBuffer, 0, D3D11_MAP_READ, 0, &mapped );
+	ADLASSERT( hr == S_OK );
+	if( SUCCEEDED( hr ) && mapped.pData )
+	{
+		memcpy( dst, mapped.pData, nElems*sizeof(T) );
+		m_context->Unmap( stagingBuffer, 0 );
+	}
+}
+
+template<typename T>
+void DeviceDX11::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
+{
+	// Uploads nElems elements from the host array src into the DX11 buffer dst,
+	// starting at element dstOffsetNElems, through UpdateSubresource.
+	BufferDX11<T>* dxDst = (BufferDX11<T>*)dst;
+
+	// Destination byte range; a buffer is addressed as a 1D box.
+	D3D11_BOX region;
+	region.left = dstOffsetNElems*sizeof(T);
+	region.right = (dstOffsetNElems+nElems)*sizeof(T);
+	region.top = 0;
+	region.bottom = 1;
+	region.front = 0;
+	region.back = 1;
+
+	this->m_context->UpdateSubresource( dxDst->getBuffer(), 0, &region, src, 0, 0 );
+}
+
+// Blocks the calling thread until the device context has processed everything
+// submitted so far: an event query is issued and polled until it reports completion.
+void DeviceDX11::waitForCompletion() const
+{
+	D3D11_QUERY_DESC qDesc;
+	qDesc.Query = D3D11_QUERY_EVENT;
+	qDesc.MiscFlags = 0;
+
+	// Without a valid query object there is nothing to poll, so bail out
+	// instead of dereferencing an uninitialised pointer.
+	ID3D11Query* syncQuery = NULL;
+	HRESULT hr = m_device->CreateQuery( &qDesc, &syncQuery );
+	ADLASSERT( hr == S_OK );
+	if( FAILED( hr ) || syncQuery == NULL ) return;
+
+	m_context->End( syncQuery );
+	while( m_context->GetData( syncQuery, 0,0,0 ) == S_FALSE ){}
+	syncQuery->Release();
+}
+
+// Returns the number of DXGI adapters enumerated by a freshly created DXGI factory,
+// or 0 when the factory cannot be created.
+int DeviceDX11::getNDevices()
+{
+	IDXGIFactory1* factory = NULL;
+	if( FAILED( CreateDXGIFactory1( __uuidof(IDXGIFactory1), (void**)&factory ) ) || factory == NULL )
+		return 0;
+
+	UINT count = 0;
+	IDXGIAdapter1* adapter = NULL;
+	// Stop on any failure (normally DXGI_ERROR_NOT_FOUND) so an unexpected error
+	// code cannot turn this into an endless loop.
+	while( SUCCEEDED( factory->EnumAdapters1( count, &adapter ) ) )
+	{
+		// Each enumerated adapter comes back with its own reference; drop it.
+		if( adapter )
+		{
+			adapter->Release();
+			adapter = NULL;
+		}
+		count++;
+	}
+
+	factory->Release();
+	return (int)count;
+}
+
+// Writes the description string of the adapter that owns m_device into nameOut,
+// converted from wide characters and truncated to fit the 128-byte buffer.
+// nameOut is left as an empty string when the adapter cannot be queried.
+void DeviceDX11::getDeviceName( char nameOut[128] ) const
+{
+	nameOut[0] = 0;
+
+	// The COM calls are kept outside ADLASSERT so they do not depend on how the
+	// assert macro is compiled.
+	IDXGIDevice* pDXGIDevice = NULL;
+	HRESULT hr = m_device->QueryInterface( __uuidof(IDXGIDevice), (void **)&pDXGIDevice );
+	ADLASSERT( hr == S_OK );
+	if( FAILED( hr ) || pDXGIDevice == NULL ) return;
+
+	IDXGIAdapter* adapter = NULL;
+	hr = pDXGIDevice->GetParent( __uuidof(IDXGIAdapter), (void **)&adapter );
+	pDXGIDevice->Release();
+	ADLASSERT( hr == S_OK );
+	if( FAILED( hr ) || adapter == NULL ) return;
+
+	DXGI_ADAPTER_DESC adapterDesc;
+	hr = adapter->GetDesc( &adapterDesc );
+	// GetParent handed us a reference; release it once the description is copied.
+	adapter->Release();
+	if( FAILED( hr ) ) return;
+
+	// _TRUNCATE converts as much as fits and always terminates the output,
+	// instead of invoking the invalid-parameter handler on an over-long name.
+	size_t nConverted = 0;
+	wcstombs_s( &nConverted, nameOut, 128, adapterDesc.Description, _TRUNCATE );
+}
+
+Kernel* DeviceDX11::getKernel(const char* fileName, const char* funcName, const char* option, const char* src, bool cacheKernel ) const
+{
+	// All kernel lookup is delegated to the kernel manager; every argument is
+	// forwarded unchanged together with this device.
+	Kernel* kernel = m_kernelManager->query( this, fileName, funcName, option, src, cacheKernel );
+	return kernel;
+}
+
+#undef u32
+
+#undef SAFE_RELEASE
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/DX11/AdlKernelUtilsDX11.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/DX11/AdlKernelUtilsDX11.inl
@@ -0,0 +1,348 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+namespace adl
+{
+
+#define SAFE_RELEASE(p)      { if(p) { (p)->Release(); (p)=NULL; } }
+
+// Typed accessors over Kernel::m_kernel for the DX11 backend.
+struct KernelDX11 : public Kernel
+{
+	// The stored kernel handle, viewed as a compute shader pointer.
+	ID3D11ComputeShader* getKernel()
+	{
+		return (ID3D11ComputeShader*)m_kernel;
+	}
+
+	// Address of the stored handle, for APIs that write the shader pointer back.
+	ID3D11ComputeShader** getKernelPtr()
+	{
+		return (ID3D11ComputeShader**)&m_kernel;
+	}
+};
+
+
+// Resolves strFilename to a path that exists on disk and writes it to strDestPath
+// (capacity cchDest characters). Candidates are tried in this order:
+//      1. strFilename as given (relative to the current directory)
+//      2. <exe dir>/../<exe name without extension>/<strFilename>
+// Returns S_OK for the first candidate that exists, E_INVALIDARG for bad arguments,
+// and E_FAIL (after asserting) when nothing is found; in that case strDestPath
+// holds the unmodified file name.
+__inline
+#ifdef UNICODE
+HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) WCHAR* strDestPath,
+                                int cchDest,
+                                __in LPCWSTR strFilename )
+#else
+HRESULT FindDXSDKShaderFileCch( __in_ecount(cchDest) CHAR* strDestPath,
+                                int cchDest,
+                                __in LPCSTR strFilename )
+#endif
+{
+    if( NULL == strFilename || strFilename[0] == 0 || NULL == strDestPath || cchDest < 10 )
+        return E_INVALIDARG;
+
+    // Get the exe name, and exe path
+#ifdef UNICODE
+    WCHAR strExePath[MAX_PATH] = { 0 };
+    WCHAR strExeName[MAX_PATH] = { 0 };
+    WCHAR* strLastSlash = NULL;
+#else
+    CHAR strExePath[MAX_PATH] = { 0 };
+    CHAR strExeName[MAX_PATH] = { 0 };
+    CHAR* strLastSlash = NULL;
+#endif
+    GetModuleFileName( NULL, strExePath, MAX_PATH );
+    strExePath[MAX_PATH - 1] = 0;
+#ifdef UNICODE
+    strLastSlash = wcsrchr( strExePath, TEXT( '\\' ) );
+#else
+    strLastSlash = strrchr( strExePath, TEXT( '\\' ) );
+#endif
+    if( strLastSlash )
+    {
+        // Keep the part after the last separator as the exe name. The ANSI build
+        // used to skip this copy, leaving the name empty in the fallback path.
+#ifdef UNICODE
+        wcscpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
+#else
+        strcpy_s( strExeName, MAX_PATH, &strLastSlash[1] );
+#endif
+        // Chop the exe name from the exe path
+        *strLastSlash = 0;
+
+        // Chop the .exe from the exe name
+#ifdef UNICODE
+        strLastSlash = wcsrchr( strExeName, TEXT( '.' ) );
+#else
+        strLastSlash = strrchr( strExeName, TEXT( '.' ) );
+#endif
+        if( strLastSlash )
+            *strLastSlash = 0;
+    }
+
+    // Candidate 1: the file name exactly as given.
+#ifdef UNICODE
+    wcscpy_s( strDestPath, cchDest, strFilename );
+#else
+    strcpy_s( strDestPath, cchDest, strFilename );
+#endif
+    if( GetFileAttributes( strDestPath ) != 0xFFFFFFFF )
+        return S_OK;
+
+    // Candidate 2: one level above the exe directory, in a folder named after the exe.
+#ifdef UNICODE
+    swprintf_s( strDestPath, cchDest, L"%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
+#else
+    sprintf_s( strDestPath, cchDest, "%s\\..\\%s\\%s", strExePath, strExeName, strFilename );
+#endif
+    if( GetFileAttributes( strDestPath ) != 0xFFFFFFFF )
+        return S_OK;
+
+    // On failure, return the file as the path but also return an error code
+#ifdef UNICODE
+    wcscpy_s( strDestPath, cchDest, strFilename );
+#else
+    strcpy_s( strDestPath, cchDest, strFilename );
+#endif
+
+    ADLASSERT( 0 );
+
+    return E_FAIL;
+}
+
+
+
+
+// Remembers the device and resolves the on-disk path of the HLSL file into m_path.
+//   fileName     : shader file name; ".hlsl" is appended when addExtension is true
+//   option       : not used by this backend
+//   cacheKernel  : not used by this backend
+// Asserts when the file cannot be located.
+template<>
+void KernelBuilder<TYPE_DX11>::setFromFile( const Device* deviceData, const char* fileName, const char* option, bool addExtension,
+	bool cacheKernel)
+{
+	char fileNameWithExtension[256];
+
+	if( addExtension )
+		sprintf_s( fileNameWithExtension, "%s.hlsl", fileName );
+	else
+		sprintf_s( fileNameWithExtension, "%s", fileName );
+
+	m_deviceData = deviceData;
+
+	// Build the name in the character type the path lookup expects. A fixed-size
+	// stack buffer replaces the former heap allocation, whose memset cleared only
+	// half of the wide-character storage.
+#ifdef UNICODE
+	WCHAR wideName[sizeof(fileNameWithExtension)] = { 0 };
+	MultiByteToWideChar( CP_ACP, 0, fileNameWithExtension, -1, wideName, (int)(sizeof(wideName)/sizeof(wideName[0])) );
+	const WCHAR* nativeName = wideName;
+#else
+	const CHAR* nativeName = fileNameWithExtension;
+#endif
+
+	// Finds the correct path for the shader file.
+	HRESULT hr = FindDXSDKShaderFileCch( m_path, MAX_PATH, nativeName );
+
+	ADLASSERT( hr == S_OK );
+}
+
+template<>
+void KernelBuilder<TYPE_DX11>::setFromSrc( const Device* deviceData, const char* src, const char* option )
+{
+	// Selects in-memory compilation: the source string pointer is stored (not copied)
+	// and the path is tagged so createKernel compiles from memory. 'option' is unused.
+	// NOTE(review): the tag is the character '0', not a NUL terminator; createKernel
+	// tests for the same character, so a file path starting with '0' would be
+	// indistinguishable from this state.
+	m_path[0] = '0';
+	m_ptr = (void*)src;
+	m_deviceData = deviceData;
+}
+
+// Destructor of the DX11 kernel builder: intentionally empty, nothing is released here.
+template<>
+KernelBuilder<TYPE_DX11>::~KernelBuilder()
+{
+
+}
+
+// Compiles the entry point funcName and stores the resulting compute shader in kernelOut.
+// The HLSL comes from memory when setFromSrc was used (m_path tagged with '0'),
+// otherwise from the file at m_path. Compilation or shader-creation failures are
+// reported through debugPrintf/ADLASSERT; in that case kernelOut.m_kernel is left
+// untouched. kernelOut.m_type is always set to TYPE_DX11.
+template<>
+void KernelBuilder<TYPE_DX11>::createKernel( const char* funcName, Kernel& kernelOut )
+{
+	const DeviceDX11* deviceData = (const DeviceDX11*)m_deviceData;
+	KernelDX11* dxKernel = (KernelDX11*)&kernelOut;
+	HRESULT hr;
+
+	DWORD dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS;
+#if defined( DEBUG ) || defined( _DEBUG )
+	// Set the D3DCOMPILE_DEBUG flag to embed debug information in the shaders.
+	// Setting this flag improves the shader debugging experience, but still allows 
+	// the shaders to be optimized and to run exactly the way they will run in 
+	// the release configuration of this program.
+	dwShaderFlags |= D3DCOMPILE_DEBUG;
+#endif
+
+	// Preprocessor definitions forwarded to the HLSL compiler.
+	const D3D_SHADER_MACRO defines[] = 
+	{
+#ifdef USE_STRUCTURED_BUFFERS
+		"USE_STRUCTURED_BUFFERS", "1",
+#endif
+
+#ifdef TEST_DOUBLE
+		"TEST_DOUBLE", "1",
+#endif
+		NULL, NULL
+	};
+
+	// We generally prefer to use the higher CS shader profile when possible as CS 5.0 is better performance on 11-class hardware
+	LPCSTR pProfile = ( deviceData->m_device->GetFeatureLevel() >= D3D_FEATURE_LEVEL_11_0 ) ? "cs_5_0" : "cs_4_0";
+
+	ID3DBlob* pErrorBlob = NULL;
+	ID3DBlob* pBlob = NULL;
+	if( m_path[0] == '0' )
+	{
+		char* src = (char*)m_ptr;
+		hr = D3DX11CompileFromMemory( src, strlen(src), 0, defines, NULL, funcName, pProfile, 
+			dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
+	}
+	else
+	{
+		hr = D3DX11CompileFromFile( m_path, defines, NULL, funcName, pProfile, 
+			dwShaderFlags, NULL, NULL, &pBlob, &pErrorBlob, NULL );
+	}
+
+	// The error blob is only filled in for some failures (e.g. not for a missing file).
+	if ( FAILED(hr) && pErrorBlob )
+	{
+		debugPrintf("%s", (char*)pErrorBlob->GetBufferPointer());
+	}
+	ADLASSERT( hr == S_OK );
+
+	// Only create the shader when there is compiled byte code to create it from.
+	if( SUCCEEDED(hr) && pBlob )
+	{
+		hr = deviceData->m_device->CreateComputeShader( pBlob->GetBufferPointer(), pBlob->GetBufferSize(), NULL, 
+			dxKernel->getKernelPtr() );
+		ADLASSERT( hr == S_OK );
+
+#if defined(DEBUG) || defined(PROFILE)
+		// Label the shader with its entry point name for graphics debuggers.
+		if ( SUCCEEDED(hr) && dxKernel->getKernel() )
+			dxKernel->getKernel()->SetPrivateData( WKPDID_D3DDebugObjectName, lstrlenA(funcName), funcName );
+#endif
+	}
+
+	SAFE_RELEASE( pErrorBlob );
+	SAFE_RELEASE( pBlob );
+
+	kernelOut.m_type = TYPE_DX11;
+}
+
+template<>
+void KernelBuilder<TYPE_DX11>::deleteKernel( Kernel& kernel )
+{
+	// Releases the compute shader held by 'kernel', if any, and clears the handle.
+	if( !kernel.m_kernel )
+		return;
+
+	KernelDX11* dx = (KernelDX11*)&kernel;
+	dx->getKernel()->Release();
+	kernel.m_kernel = NULL;
+}
+
+
+
+// DX11 implementation of the Launcher operations. All members are static and take
+// the Launcher they act on as their first argument.
+class LauncherDX11
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// Binds n buffers to the compute stage: read-only ones as shader resources,
+		// the others as unordered access views.
+		__inline
+		static void setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n );
+		// Uploads 'consts' into constBuff and binds it as constant buffer slot 0.
+		template<typename T>
+		__inline
+		static void setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts );
+		// Dispatches the launcher's kernel over a 2D range, then unbinds the shader,
+		// UAVs and SRVs.
+		__inline
+		static void launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY );
+};
+
+void LauncherDX11::setBuffers( Launcher* launcher, BufferInfo* buffInfo, int n )
+{
+	// Binds each of the n buffers to the compute stage. Read-only buffers take the next
+	// shader-resource slot (launcher->m_idx); all others take the next unordered-access
+	// slot (launcher->m_idxRw).
+	const DeviceDX11* device = (const DeviceDX11*)launcher->m_deviceData;
+
+	for(int k=0; k<n; k++)
+	{
+		const BufferInfo& info = buffInfo[k];
+		BufferDX11<int>* dxBuffer = (BufferDX11<int>*)info.m_buffer;
+
+		if( !info.m_isReadOnly )
+		{
+			//	todo. cannot initialize append buffer with proper counter value which is the last arg
+			device->m_context->CSSetUnorderedAccessViews( launcher->m_idxRw++, 1, dxBuffer->getUAVPtr(), 0 );
+			continue;
+		}
+
+		device->m_context->CSSetShaderResources( launcher->m_idx++, 1, dxBuffer->getSRVPtr() );
+	}
+}
+
+template<typename T>
+void LauncherDX11::setConst( Launcher* launcher, Buffer<T>& constBuff, const T& consts )
+{
+	// Copies 'consts' into the constant buffer with UpdateSubresource (whole resource)
+	// and binds that buffer to compute-shader constant slot 0.
+	const DeviceDX11* device = (const DeviceDX11*)launcher->m_deviceData;
+	BufferDX11<T>* dxConst = (BufferDX11<T>*)&constBuff;
+
+	device->m_context->UpdateSubresource( dxConst->getBuffer(), 0, NULL, &consts, 0, 0 );
+	device->m_context->CSSetConstantBuffers( 0, 1, dxConst->getBufferPtr() );
+}
+
+void LauncherDX11::launch2D( Launcher* launcher, int numThreadsX, int numThreadsY, int localSizeX, int localSizeY )
+{
+	// Runs the launcher's kernel over numThreadsX x numThreadsY threads in groups of
+	// localSizeX x localSizeY (group counts rounded up, at least one per axis), then
+	// clears the shader and the UAV/SRV slots that were bound for this launch.
+	KernelDX11* shader = (KernelDX11*)launcher->m_kernel;
+	const DeviceDX11* device = (const DeviceDX11*)launcher->m_deviceData;
+
+	device->m_context->CSSetShader( shader->getKernel(), NULL, 0 );
+
+	// Number of groups per axis: quotient, plus one when there is a remainder.
+	int groupsX = numThreadsX/localSizeX;
+	if( numThreadsX%localSizeX ) groupsX += 1;
+	if( groupsX < 1 ) groupsX = 1;
+
+	int groupsY = numThreadsY/localSizeY;
+	if( numThreadsY%localSizeY ) groupsY += 1;
+	if( groupsY < 1 ) groupsY = 1;
+
+	device->m_context->Dispatch( groupsX, groupsY, 1 );
+
+	//	set 0 to registers
+	device->m_context->CSSetShader( NULL, NULL, 0 );
+
+	if( launcher->m_idxRw )
+	{
+		ID3D11UnorderedAccessView* nullUAVs[ 16 ] = { 0 };
+		unsigned int nUAVs = (unsigned int)launcher->m_idxRw;
+		if( nUAVs > sizeof(nullUAVs)/sizeof(*nullUAVs) ) nUAVs = sizeof(nullUAVs)/sizeof(*nullUAVs);
+		device->m_context->CSSetUnorderedAccessViews( 0, nUAVs, nullUAVs, NULL );
+	}
+
+	if( launcher->m_idx )
+	{
+		ID3D11ShaderResourceView* nullSRVs[16] = { 0 };
+		unsigned int nSRVs = (unsigned int)launcher->m_idx;
+		if( nSRVs > sizeof(nullSRVs)/sizeof(*nullSRVs) ) nSRVs = sizeof(nullSRVs)/sizeof(*nullSRVs);
+		device->m_context->CSSetShaderResources( 0, nSRVs, nullSRVs );
+	}
+}
+
+#undef SAFE_RELEASE
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/DX11/AdlStopwatchDX11.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/DX11/AdlStopwatchDX11.inl
@@ -0,0 +1,131 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+namespace adl
+{
+
+// GPU stopwatch for the DX11 backend, built on D3D11 timestamp queries.
+// start() records the first timestamp, split() records intermediate ones and
+// stop() records the last; getMs() converts differences to milliseconds.
+struct StopwatchDX11 : public StopwatchBase
+{
+	public:
+		__inline
+		StopwatchDX11() : StopwatchBase(){}
+		// Releases every query; the definition below releases them unconditionally.
+		__inline
+		~StopwatchDX11();
+
+		// Creates the disjoint query and the CAPACITY+1 timestamp queries on the device.
+		__inline
+		void init( const Device* deviceData );
+		__inline
+		void start();
+		__inline
+		void split();
+		__inline
+		void stop();
+		__inline
+		float getMs(int index=0);
+		__inline
+		void getMs( float* times, int capacity );
+
+	public:
+		// Timestamp queries; one more than CAPACITY.
+		ID3D11Query* m_tQuery[CAPACITY+1];
+		// Timestamp-disjoint query bracketing the measurement (Begin in start(), End in stop()).
+		ID3D11Query* m_fQuery;
+		// Timestamps read back by getMs(). NOTE(review): holds CAPACITY entries while
+		// there are CAPACITY+1 queries.
+		UINT64 m_t[CAPACITY];
+};
+
+void StopwatchDX11::init( const Device* deviceData )
+{
+	// Binds the stopwatch to a DX11 device and creates its queries: one
+	// timestamp-disjoint query plus CAPACITY+1 timestamp queries.
+	ADLASSERT( deviceData->m_type == TYPE_DX11 );
+	m_device = deviceData;
+
+	const DeviceDX11* dx = (const DeviceDX11*)m_device;
+
+	D3D11_QUERY_DESC desc;
+	desc.MiscFlags = 0;
+
+	desc.Query = D3D11_QUERY_TIMESTAMP_DISJOINT;
+	dx->m_device->CreateQuery( &desc, &m_fQuery );
+
+	desc.Query = D3D11_QUERY_TIMESTAMP;
+	for(int q=0; q<=CAPACITY; q++)
+	{
+		dx->m_device->CreateQuery( &desc, &m_tQuery[q] );
+	}
+}
+
+StopwatchDX11::~StopwatchDX11()
+{
+	// Releases the disjoint query followed by all CAPACITY+1 timestamp queries.
+	// NOTE(review): the pointers are used unconditionally, so this relies on the
+	// queries having been created by init().
+	m_fQuery->Release();
+
+	int q = 0;
+	while( q <= CAPACITY )
+	{
+		m_tQuery[q]->Release();
+		++q;
+	}
+}
+
+void StopwatchDX11::start()
+{
+	// Opens the disjoint query and records the first timestamp in slot 0.
+	const DeviceDX11* dx = (const DeviceDX11*)m_device;
+
+	m_idx = 0;
+	dx->m_context->Begin( m_fQuery );
+	dx->m_context->End( m_tQuery[m_idx] );
+	m_idx++;
+}
+
+void StopwatchDX11::split()
+{
+	// Records an intermediate timestamp; silently ignored once CAPACITY slots are used.
+	if( m_idx >= CAPACITY )
+		return;
+
+	const DeviceDX11* dx = (const DeviceDX11*)m_device;
+	dx->m_context->End( m_tQuery[m_idx] );
+	m_idx++;
+}
+
+void StopwatchDX11::stop()
+{
+	// Records the final timestamp in the next slot and closes the disjoint query.
+	const DeviceDX11* dx = (const DeviceDX11*)m_device;
+
+	dx->m_context->End( m_tQuery[m_idx] );
+	m_idx++;
+	dx->m_context->End( m_fQuery );
+}
+
+// Returns the time in milliseconds between timestamp 'index' and timestamp 'index+1'
+// (index 0 is the interval that begins at start()). Busy-waits until the GPU has
+// delivered the disjoint data and both timestamps. Returns 0 for an index outside
+// [0, CAPACITY).
+float StopwatchDX11::getMs(int index)
+{
+	ADLASSERT( index >= 0 && index < CAPACITY );
+	if( index < 0 || index >= CAPACITY ) return 0.f;
+
+	const DeviceDX11* dx = (const DeviceDX11*)m_device;
+
+	D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
+	while( dx->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}
+
+	// Read the two queries that bound the requested interval (previously queries 0 and 1
+	// were read no matter which index was asked for).
+	UINT64 tBegin = 0;
+	UINT64 tEnd = 0;
+	while( dx->m_context->GetData( m_tQuery[index], &tBegin,sizeof(UINT64),0 ) == S_FALSE ){}
+	while( dx->m_context->GetData( m_tQuery[index+1], &tEnd,sizeof(UINT64),0 ) == S_FALSE ){}
+
+	// Mirror the values into m_t, which has one slot fewer than there are queries.
+	m_t[index] = tBegin;
+	if( index+1 < CAPACITY ) m_t[index+1] = tEnd;
+
+	ADLASSERT( d.Disjoint == false );
+
+	float elapsedMs = (tEnd - tBegin)/(float)d.Frequency*1000;
+	return elapsedMs;
+}
+
+// Fills times[0..capacity) with the consecutive intervals, in milliseconds, between
+// the timestamps recorded so far; entries without a recorded interval are set to 0.
+// Busy-waits until the GPU has delivered every recorded timestamp.
+void StopwatchDX11::getMs( float* times, int capacity )
+{
+	ADLASSERT( capacity <= CAPACITY );
+
+	for(int i=0; i<capacity; i++) times[i] = 0.f;
+
+	const DeviceDX11* dx = (const DeviceDX11*)m_device;
+
+	D3D11_QUERY_DATA_TIMESTAMP_DISJOINT d;
+	while( dx->m_context->GetData( m_fQuery, &d,sizeof(D3D11_QUERY_DATA_TIMESTAMP_DISJOINT),0 ) == S_FALSE ) {}
+
+	// Up to CAPACITY+1 timestamps can have been recorded, one more than m_t can hold,
+	// so they are collected in a local array of the right size.
+	UINT64 stamps[CAPACITY+1];
+	int nStamps = m_idx;
+	if( nStamps < 0 ) nStamps = 0;
+	if( nStamps > CAPACITY+1 ) nStamps = CAPACITY+1;
+
+	for(int i=0; i<nStamps; i++)
+	{
+		while( dx->m_context->GetData( m_tQuery[i], &stamps[i],sizeof(UINT64),0 ) == S_FALSE ){}
+		if( i < CAPACITY ) m_t[i] = stamps[i];
+	}
+
+	ADLASSERT( d.Disjoint == false );
+
+	// Only report intervals whose two end points were actually recorded.
+	int nIntervals = nStamps-1;
+	if( nIntervals > capacity ) nIntervals = capacity;
+
+	for(int i=0; i<nIntervals; i++)
+	{
+		times[i] = (stamps[i+1] - stamps[i])/(float)d.Frequency*1000;
+	}
+}
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Host/AdlHost.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Host/AdlHost.inl
@@ -0,0 +1,107 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+namespace adl
+{
+
+// Host (CPU) device: buffers are plain arrays allocated with new[] and copies are memcpy.
+struct DeviceHost : public Device
+{
+	DeviceHost() : Device( TYPE_HOST ){}
+
+	// No-op for the host device.
+	__inline
+	void initialize(const Config& cfg);
+	// No-op for the host device.
+	__inline
+	void release();
+
+	// Attaches buf to this device and, unless type is BUFFER_CONST, allocates nElems elements.
+	template<typename T>
+	__inline
+	void allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type);
+
+	// Frees the array owned by buf.
+	template<typename T>
+	__inline
+	void deallocate(Buffer<T>* buf);
+
+	// Buffer-to-buffer copy of nElems elements.
+	template<typename T>
+	__inline
+	void copy(Buffer<T>* dst, const Buffer<T>* src, int nElems);
+
+	// Copies nElems elements from src, starting at element offsetNElems, into the array dst.
+	template<typename T>
+	__inline
+	void copy(T* dst, const Buffer<T>* src, int nElems, int offsetNElems = 0);
+
+	// Copies nElems elements from the array src into dst, starting at element offsetNElems.
+	template<typename T>
+	__inline
+	void copy(Buffer<T>* dst, const T* src, int nElems, int offsetNElems = 0);
+
+	// No-op for the host device.
+	__inline
+	void waitForCompletion() const;
+};
+
+// Intentionally empty: the host device performs no setup and ignores cfg.
+void DeviceHost::initialize(const Config& cfg)
+{
+
+}
+
+// Intentionally empty: the host device holds nothing that needs releasing.
+void DeviceHost::release()
+{
+
+}
+
+template<typename T>
+void DeviceHost::allocate(Buffer<T>* buf, int nElems, BufferBase::BufferType type)
+{
+	// Attaches the buffer to this device. Constant buffers get no storage on the host;
+	// every other type receives a new[] array of nElems elements.
+	buf->m_device = this;
+
+	if( type != BufferBase::BUFFER_CONST )
+	{
+		T* storage = new T[nElems];
+		buf->m_ptr = storage;
+		ADLASSERT( buf->m_ptr );
+		buf->m_size = nElems;
+	}
+}
+
+// Frees the host array owned by buf and clears the pointer so that a repeated call
+// (or a later check of m_ptr) cannot touch the freed memory.
+template<typename T>
+void DeviceHost::deallocate(Buffer<T>* buf)
+{
+	// delete[] on a null pointer is a no-op, so no guard is needed.
+	delete [] buf->m_ptr;
+	buf->m_ptr = 0;
+}
+
+template<typename T>
+void DeviceHost::copy(Buffer<T>* dst, const Buffer<T>* src, int nElems)
+{
+	// Buffer-to-buffer copy: forwards the source array to the array-to-buffer overload.
+	const T* hostSrc = src->m_ptr;
+	this->copy( dst, hostSrc, nElems );
+}
+
+template<typename T>
+void DeviceHost::copy(T* dst, const Buffer<T>* src, int nElems, int srcOffsetNElems)
+{
+	// Copies nElems elements out of a host buffer, starting at element srcOffsetNElems.
+	ADLASSERT( src->getType() == TYPE_HOST );
+
+	const T* from = src->m_ptr + srcOffsetNElems;
+	memcpy( dst, from, nElems*sizeof(T) );
+}
+
+template<typename T>
+void DeviceHost::copy(Buffer<T>* dst, const T* src, int nElems, int dstOffsetNElems)
+{
+	// Copies nElems elements into a host buffer, starting at element dstOffsetNElems.
+	ADLASSERT( dst->getType() == TYPE_HOST );
+
+	T* to = dst->m_ptr + dstOffsetNElems;
+	memcpy( to, src, nElems*sizeof(T) );
+}
+
+// Intentionally empty: nothing is waited on for the host device.
+void DeviceHost::waitForCompletion() const
+{
+
+}
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Host/AdlStopwatchHost.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/Adl/Host/AdlStopwatchHost.inl
@@ -0,0 +1,119 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#ifdef _WIN32
+	#include <windows.h>
+#else
+	#include <sys/time.h>
+#endif
+
+namespace adl
+{
+
+// CPU stopwatch: QueryPerformanceCounter on Windows, gettimeofday elsewhere.
+// start() records the first timestamp, split()/stop() append further ones, and
+// getMs() returns the difference between consecutive timestamps in milliseconds.
+class StopwatchHost : public StopwatchBase
+{
+	public:
+		__inline
+		StopwatchHost();
+		__inline
+		void init( const Device* deviceData );
+		__inline
+		void start();
+		__inline
+		void split();
+		__inline
+		void stop();
+		__inline
+		float getMs(int index=0);
+		__inline
+		void getMs( float* times, int capacity );
+
+	private:
+#ifdef _WIN32
+		// Counter frequency, queried once in init().
+		LARGE_INTEGER m_frequency;
+		// Recorded counter values.
+		LARGE_INTEGER m_t[CAPACITY];
+#else
+		// Time captured in init().
+		struct timeval mStartTime;
+		// Recorded timestamps.
+		timeval m_t[CAPACITY];
+#endif
+};
+
+// Default constructor: only runs the base-class constructor; timing state is set up by init().
+__inline
+StopwatchHost::StopwatchHost()
+ : StopwatchBase()
+{
+}
+
+__inline
+void StopwatchHost::init( const Device* deviceData )
+{
+	// Captures the timing base for this platform (counter frequency on Windows, the
+	// current time elsewhere) and remembers the device.
+#ifdef _WIN32
+	QueryPerformanceFrequency( &m_frequency );
+#else
+	gettimeofday(&mStartTime, 0);
+#endif
+	m_device = deviceData;
+}
+
+__inline
+void StopwatchHost::start()
+{
+	// Restarts the measurement: the first timestamp goes into slot 0 and the next
+	// free slot becomes 1.
+	m_idx = 0;
+#ifdef _WIN32
+	QueryPerformanceCounter(&m_t[m_idx]);
+#else
+	gettimeofday(&m_t[m_idx], 0);
+#endif
+	m_idx++;
+}
+
+// Records the current time in the next free timestamp slot. Once all CAPACITY slots
+// of m_t are used, further calls are ignored instead of writing past the array.
+__inline
+void StopwatchHost::split()
+{
+	if( m_idx >= CAPACITY )
+		return;
+
+#ifdef _WIN32
+	QueryPerformanceCounter(&m_t[m_idx++]);
+#else
+	gettimeofday(&m_t[m_idx++], 0);
+#endif
+}
+
+// Ends the measurement by recording one more timestamp; identical to split().
+__inline
+void StopwatchHost::stop()
+{
+	split();
+}
+
+// Returns the time in milliseconds between timestamp 'index' and timestamp 'index+1'.
+__inline
+float StopwatchHost::getMs(int index)
+{
+#ifdef _WIN32
+	return (float)(1000*(m_t[index+1].QuadPart - m_t[index].QuadPart))/m_frequency.QuadPart;
+#else
+	// Use floating-point arithmetic so the microsecond part keeps its sub-millisecond
+	// fraction (integer division used to truncate it to whole milliseconds).
+	return (m_t[index+1].tv_sec - m_t[index].tv_sec) * 1000.f +
+		(m_t[index+1].tv_usec - m_t[index].tv_usec) / 1000.f;
+#endif
+}
+
+__inline
+void StopwatchHost::getMs(float* times, int capacity)
+{
+	// Fills times[0..capacity) with the recorded intervals in milliseconds; entries
+	// beyond the last recorded interval are left at 0.
+	int nIntervals = m_idx-1;
+	if( capacity < nIntervals ) nIntervals = capacity;
+
+	for(int k=0; k<capacity; k++)
+	{
+		times[k] = 0.f;
+	}
+
+	for(int k=0; k<nIntervals; k++)
+	{
+		times[k] = getMs(k);
+	}
+}
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.h
@@ -0,0 +1,73 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+
+namespace adl
+{
+
+// Device-independent part of the Copy primitive.
+class CopyBase
+{
+	public:
+		// Selects the float4 copy kernel variant. In Copy::execute, PER_WI_1, PER_WI_2 and
+		// PER_WI_4 launch n/1, n/2 and n/4 work items respectively.
+		enum Option
+		{
+			PER_WI_1, 
+			PER_WI_2, 
+			PER_WI_4, 
+		};
+};
+
+// Buffer-to-buffer copy primitive for device type TYPE. All state lives in a Data
+// object obtained from allocate() and destroyed with deallocate().
+template<DeviceType TYPE>
+class Copy : public CopyBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// Kernels and constant buffer used by execute().
+		struct Data
+		{
+			const Device* m_device;
+			Kernel* m_copy1F4Kernel;
+			Kernel* m_copy2F4Kernel;
+			Kernel* m_copy4F4Kernel;
+			Kernel* m_copyF1Kernel;
+			Kernel* m_copyF2Kernel;
+			Buffer<int4>* m_constBuffer;
+		};
+
+		// Creates the Data for the given device.
+		static
+		Data* allocate(const Device* deviceData);
+
+		// Destroys a Data returned by allocate().
+		static
+		void deallocate(Data* data);
+
+		// Copies n float4 elements from src to dst with the kernel variant chosen by option.
+		static
+		void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);
+
+		// Copies n float2 elements from src to dst.
+		static
+		void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);
+
+		// Copies n float elements from src to dst.
+		static
+		void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
+};
+
+
+#include <AdlPrimitives/Copy/CopyHost.inl>
+#include <AdlPrimitives/Copy/Copy.inl>
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.inl
@@ -0,0 +1,151 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
+#define KERNEL0 "Copy1F4Kernel"
+#define KERNEL1 "Copy2F4Kernel"
+#define KERNEL2 "Copy4F4Kernel"
+#define KERNEL3 "CopyF1Kernel"
+#define KERNEL4 "CopyF2Kernel"
+
+#include <AdlPrimitives/Copy/CopyKernelsCL.h>
+#include <AdlPrimitives/Copy/CopyKernelsDX11.h>
+
+
+template<DeviceType TYPE>
+typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
+{
+	// Builds the per-device state of the copy primitive: the five copy kernels and a
+	// one-element int4 constant buffer. The caller owns the result and frees it with
+	// deallocate().
+	ADLASSERT( TYPE == device->m_type );
+
+	// Embedded kernel sources, indexed by device type; all null when kernels are
+	// loaded from file instead.
+	// NOTE(review): the table has two entries, so TYPE is expected to be 0 or 1 here.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* src[] = { copyKernelsCL, copyKernelsDX11 };
+#else
+	const char* src[] = { 0, 0 };
+#endif
+	const char* kernelSrc = src[TYPE];
+
+	Data* data = new Data;
+	data->m_device = device;
+
+	data->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, kernelSrc );
+	data->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, kernelSrc );
+	data->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, kernelSrc );
+	data->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, kernelSrc );
+	data->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, kernelSrc );
+
+	data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
+
+	return data;
+}
+
+template<DeviceType TYPE>
+void Copy<TYPE>::deallocate( Data* data )
+{
+	// Destroys the constant buffer first, then the Data object itself.
+	// The kernels are not released here.
+	Buffer<int4>* constants = data->m_constBuffer;
+	delete constants;
+	delete data;
+}
+
+template<DeviceType TYPE>
+void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
+{
+	// Copies n float4 elements from src to dst. 'option' picks the kernel variant and
+	// with it how many work items are launched (n, n/2 or n/4); n must be divisible
+	// by 2 or 4 for the corresponding variants. An unknown option asserts and does nothing.
+	ADLASSERT( TYPE == dst.getType() );
+	ADLASSERT( TYPE == src.getType() );
+
+	int4 constBuffer;
+	constBuffer.x = n;
+
+	Kernel* kernel = 0;
+	int elemsPerWorkItem = 1;
+
+	switch (option)
+	{
+	case PER_WI_1:
+		kernel = data->m_copy1F4Kernel;
+		elemsPerWorkItem = 1;
+		break;
+	case PER_WI_2:
+		ADLASSERT( n%2 == 0 );
+		kernel = data->m_copy2F4Kernel;
+		elemsPerWorkItem = 2;
+		break;
+	case PER_WI_4:
+		ADLASSERT( n%4 == 0 );
+		kernel = data->m_copy4F4Kernel;
+		elemsPerWorkItem = 4;
+		break;
+	default:
+		ADLASSERT(0);
+		return;
+	};
+
+	// dst is bound writable, src read-only.
+	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
+
+	Launcher launcher( data->m_device, kernel );
+	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+	launcher.setConst( *data->m_constBuffer, constBuffer );
+	launcher.launch1D( n/elemsPerWorkItem );
+}
+
+template<DeviceType TYPE>
+void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
+{
+	// Copies n float2 elements from src to dst, one work item per element.
+	ADLASSERT( TYPE == dst.getType() );
+	ADLASSERT( TYPE == src.getType() );
+
+	// Only the x component carries data (the element count).
+	int4 params;
+	params.x = n;
+
+	// dst is bound writable, src read-only.
+	BufferInfo buffers[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
+	const int nBuffers = sizeof(buffers)/sizeof(Launcher::BufferInfo);
+
+	Launcher launcher( data->m_device, data->m_copyF2Kernel );
+	launcher.setBuffers( buffers, nBuffers );
+	launcher.setConst( *data->m_constBuffer, params );
+	launcher.launch1D( n );
+}
+
+template<DeviceType TYPE>
+void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
+{
+	// Copies n float elements from src to dst, one work item per element.
+	ADLASSERT( TYPE == dst.getType() );
+	ADLASSERT( TYPE == src.getType() );
+
+	// Only the x component carries data (the element count).
+	int4 params;
+	params.x = n;
+
+	// dst is bound writable, src read-only.
+	BufferInfo buffers[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
+	const int nBuffers = sizeof(buffers)/sizeof(Launcher::BufferInfo);
+
+	Launcher launcher( data->m_device, data->m_copyF1Kernel );
+	launcher.setBuffers( buffers, nBuffers );
+	launcher.setConst( *data->m_constBuffer, params );
+	launcher.launch1D( n );
+}
+
+
+#undef PATH
+#undef KERNEL0
+#undef KERNEL1
+#undef KERNEL2
+#undef KERNEL3
+#undef KERNEL4
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1,5 @@`
				`stringify.py global_atomics.cl globalAtomicsKernelString >globalAtomicsKernel.h`