Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cl
@@ -0,0 +1,349 @@
+
+MSTRINGIFY(
+
+// Converts a 3D cell coordinate into the linear index of that cell.
+//   gridPos : cell coordinate; only x, y and z are read.
+//   pParams : broadphase parameter block; element 1 is reinterpreted as an
+//             int4 whose x, y, z are the grid dimensions.
+// Every coordinate is masked with dimension-1 before use, so out-of-range
+// coordinates wrap around. NOTE(review): the mask only behaves as a modulo
+// when each grid dimension is a power of two - confirm the host enforces this.
+int getPosHash(int4 gridPos, __global float4* pParams)
+{
+	int4 dims = *((__global int4*)(pParams + 1));
+	int cx = gridPos.x & (dims.x - 1);
+	int cy = gridPos.y & (dims.y - 1);
+	int cz = gridPos.z & (dims.z - 1);
+	// row-major layout: x runs fastest, then y, then z
+	return cz * dims.y * dims.x + cy * dims.x + cx;
+}
+
+// Returns the grid cell that contains a world-space position.
+//   worldPos : position; only x, y and z are read.
+//   pParams  : element 0 supplies the per-axis scale applied to the position,
+//              element 1 is reinterpreted as int4 grid dimensions.
+// Each scaled coordinate is floored and masked with dimension-1.
+// The w component of the result is left unset.
+int4 getGridPos(float4 worldPos, __global float4* pParams)
+{
+	int4 dims = *((__global int4*)(pParams + 1));
+	float4 scale = pParams[0];
+	int4 cell;
+	cell.x = ((int)floor(worldPos.x * scale.x)) & (dims.x - 1);
+	cell.y = ((int)floor(worldPos.y * scale.y)) & (dims.y - 1);
+	cell.z = ((int)floor(worldPos.z * scale.z)) & (dims.z - 1);
+	return cell;
+}
+
+
+// Kernel: one work-item per body. Hashes the centre of the body AABB to a
+// grid cell and stores the pair (cell hash, body index) in pHash.
+//   numObjects : number of bodies; extra work-items do nothing.
+//   pAABB      : two float4 per body, minimum corner then maximum corner.
+//   pHash      : output, one int2 per body, x = cell hash, y = body index.
+//   pParams    : grid parameters consumed by getGridPos and getPosHash.
+__kernel void kCalcHashAABB(int numObjects, __global float4* pAABB, __global int2* pHash, __global float4* pParams GUID_ARG)
+{
+	int bodyIdx = get_global_id(0);
+	if(bodyIdx < numObjects)
+	{
+		float4 lo = pAABB[bodyIdx*2];
+		float4 hi = pAABB[bodyIdx*2 + 1];
+		// centre of the box; w is not part of the position
+		float4 centre;
+		centre.x = (lo.x + hi.x) * 0.5f;
+		centre.y = (lo.y + hi.y) * 0.5f;
+		centre.z = (lo.z + hi.z) * 0.5f;
+		centre.w = 0.f;
+		int4 cell = getGridPos(centre, pParams);
+		int2 entry;
+		entry.x = getPosHash(cell, pParams);
+		entry.y = bodyIdx;
+		pHash[bodyIdx] = entry;
+	}
+}
+
+// Kernel: one work-item per grid cell. Marks every cell as empty by storing
+// -1, the value findPairsInCell tests for before scanning a cell.
+__kernel void kClearCellStart(	int numCells, 
+								__global int* pCellStart GUID_ARG)
+{
+	int cellIdx = get_global_id(0);
+	if(cellIdx < numCells)
+	{
+		pCellStart[cellIdx] = -1;
+	}
+}
+
+// Kernel: one work-item per entry of the hash array. An entry whose hash
+// differs from the hash of the previous entry is the first body of its cell,
+// and its position is written to cellStart for that hash.
+// NOTE(review): this only yields cell starts if pHash is sorted by hash
+// beforehand - confirm against the caller.
+// Hashes are staged in local memory so each work-item can read the hash of
+// its predecessor without a second global load. The array has 513 slots,
+// presumably for work-groups of up to 512 items plus one predecessor slot.
+__kernel void kFindCellStart(int numObjects, __global int2* pHash, __global int* cellStart GUID_ARG)
+{
+	__local int sharedHash[513];
+	int gid = get_global_id(0);
+	int lid = get_local_id(0);
+	int inRange = (gid < numObjects);
+	int2 entry;
+	if(inRange)
+	{
+		entry = pHash[gid];
+		// slot lid+1 holds this item, so slot lid holds the predecessor
+		sharedHash[lid + 1] = entry.x;
+		if((lid == 0) && (gid > 0))
+		{
+			// the predecessor of the first item belongs to the previous group
+			sharedHash[0] = pHash[gid-1].x;
+		}
+	}
+	// reached by every work-item of the group, including out-of-range ones
+	barrier(CLK_LOCAL_MEM_FENCE);
+	if(inRange)
+	{
+		if((gid == 0) || (entry.x != sharedHash[lid]))
+		{
+			cellStart[entry.x] = gid;
+		}
+	}
+}
+
+// Returns 1 when the two axis-aligned boxes overlap or touch on all three
+// axes, 0 otherwise. Only the x, y and z components are compared.
+int testAABBOverlap(float4 min0, float4 max0, float4 min1, float4 max1)
+{
+	if(!((min0.x <= max1.x) && (min1.x <= max0.x)))
+	{
+		return 0;
+	}
+	if(!((min0.y <= max1.y) && (min1.y <= max0.y)))
+	{
+		return 0;
+	}
+	if(!((min0.z <= max1.z) && (min1.z <= max0.z)))
+	{
+		return 0;
+	}
+	return 1;
+}
+
+
+
+
+
+// Tests the body stored at position index of the hash array against the
+// bodies hashed into grid cell gridPos and records overlaps in the pair slot
+// of that body.
+//   numObjects         : number of entries in pHash, upper bound of the scan.
+//   gridPos            : cell to scan; wrapped by getPosHash.
+//   index              : position of the tested body inside pHash.
+//   pHash              : int2 per body, x = cell hash, y = body index.
+//   pCellStart         : first pHash position of each cell, -1 when empty.
+//   pAABB              : two float4 per body; the w of the minimum corner
+//                        carries the handle index as raw integer bits.
+//   pPairBuff          : pair storage, one contiguous slot per handle.
+//   pPairBuffStartCurr : int2 per handle, x = slot start, y = entries in use.
+//   pParams            : grid parameters; the w of the int4 in element 1 is
+//                        the maximum number of bodies scanned per cell.
+// Pair entry format as used here: the low bits hold the handle index of the
+// other body, 0x20000000 is set on entries appended by this call and
+// 0x40000000 is OR-ed into entries that already existed and overlap again.
+void findPairsInCell(	int numObjects,
+						int4	gridPos,
+						int    index,
+						__global int2*  pHash,
+						__global int*   pCellStart,
+						__global float4* pAABB, 
+						__global int*   pPairBuff,
+						__global int2*	pPairBuffStartCurr,
+						__global float4* pParams)
+{
+	int4 pGridDim = *((__global int4*)(pParams + 1));
+	int maxBodiesPerCell = pGridDim.w;
+    int gridHash = getPosHash(gridPos, pParams);
+    // get start of bucket for this cell
+    int bucketStart = pCellStart[gridHash];
+    if (bucketStart == -1)
+	{
+        return;   // cell empty
+	}
+	// iterate over bodies in this cell
+    int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+    float4 min0 = pAABB[unsorted_indx*2 + 0]; 
+	float4 max0 = pAABB[unsorted_indx*2 + 1];
+	// the handle index travels in the w component as raw integer bits
+	int handleIndex =  as_int(min0.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	// the slot ends where the slot of the next handle begins
+	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	int curr_max = start_curr_next.x - start - 1;
+	// scan at most maxBodiesPerCell entries and never past the end of pHash
+	int bucketEnd = bucketStart + maxBodiesPerCell;
+	bucketEnd = (bucketEnd > numObjects) ? numObjects : bucketEnd;
+	for(int index2 = bucketStart; index2 < bucketEnd; index2++) 
+	{
+        int2 cellData = pHash[index2];
+        if (cellData.x != gridHash)
+        {
+			break;   // no longer in same bucket
+		}
+		int unsorted_indx2 = cellData.y;
+		// strict less-than skips the body itself and also means a pair is
+		// only recorded in the slot of the body with the larger index
+        if (unsorted_indx2 < unsorted_indx) // check not colliding with self
+        {   
+			float4 min1 = pAABB[unsorted_indx2*2 + 0];
+			float4 max1 = pAABB[unsorted_indx2*2 + 1];
+			if(testAABBOverlap(min0, max0, min1, max1))
+			{
+				int handleIndex2 = as_int(min1.w);
+				int k;
+				// look for this partner among the entries already in the slot
+				for(k = 0; k < curr; k++)
+				{
+					int old_pair = pPairBuff[start+k] & (~0x60000000);
+					if(old_pair == handleIndex2)
+					{
+						pPairBuff[start+k] |= 0x40000000;
+						break;
+					}
+				}
+				// k == curr means the partner was not found: append it
+				if(k == curr)
+				{
+					if(curr >= curr_max) 
+					{ // not a good solution, but let's avoid crash
+						// slot is full: stops scanning the cell, further
+						// overlaps in this cell are dropped
+						break;
+					}
+					pPairBuff[start+curr] = handleIndex2 | 0x20000000;
+					curr++;
+				}
+			}
+		}
+	}
+	// write the updated entry count back; the slot start is unchanged
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = curr;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+    return;
+}
+
+// Kernel: one work-item per entry of the hash array. Locates the grid cell
+// containing the centre of the body AABB and runs findPairsInCell on that
+// cell and its 26 neighbours, visiting z outermost, then y, then x.
+//   numObjects         : number of entries in pHash.
+//   pAABB              : two float4 per body, minimum then maximum corner.
+//   pHash              : int2 per body, x = cell hash, y = body index.
+//   pCellStart         : first pHash position of each cell.
+//   pPairBuff          : pair storage filled by findPairsInCell.
+//   pPairBuffStartCurr : per-handle slot start and entry count.
+//   pParams            : grid parameters.
+__kernel void kFindOverlappingPairs(	int numObjects,
+										__global float4* pAABB, 
+										__global int2* pHash, 
+										__global int* pCellStart, 
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global float4* pParams GUID_ARG)
+{
+	int sortedIdx = get_global_id(0);
+	if(sortedIdx < numObjects)
+	{
+		int bodyIdx = pHash[sortedIdx].y;
+		float4 lo = pAABB[bodyIdx*2 + 0];
+		float4 hi = pAABB[bodyIdx*2 + 1];
+		// only x, y and z are consumed by getGridPos
+		float4 centre;
+		centre.x = (lo.x + hi.x) * 0.5f;
+		centre.y = (lo.y + hi.y) * 0.5f;
+		centre.z = (lo.z + hi.z) * 0.5f;
+		int4 home = getGridPos(centre, pParams);
+		int4 cell;
+		for(int dz = -1; dz <= 1; dz++)
+		{
+			cell.z = home.z + dz;
+			for(int dy = -1; dy <= 1; dy++)
+			{
+				cell.y = home.y + dy;
+				for(int dx = -1; dx <= 1; dx++)
+				{
+					cell.x = home.x + dx;
+					findPairsInCell(numObjects, cell, sortedIdx, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, pParams);
+				}
+			}
+		}
+	}
+}
+
+
+// Kernel: one work-item per entry of the hash array. Tests the body against
+// every large body and records overlaps in the pair slot of the body, using
+// the same entry format as findPairsInCell: 0x20000000 marks an entry
+// appended by this call, 0x40000000 marks an existing entry that overlaps
+// again.
+//   numObjects         : number of entries in pHash; the AABBs of the large
+//                        bodies are read from pAABB starting at this index.
+//   pAABB              : two float4 per body; the w of the minimum corner
+//                        carries the handle index as raw integer bits.
+//   pHash              : int2 per body, y = body index.
+//   pCellStart         : not used by this kernel.
+//   pPairBuff          : pair storage, one contiguous slot per handle.
+//   pPairBuffStartCurr : int2 per handle, x = slot start, y = entries in use.
+//   numLarge           : number of large bodies to test against.
+// The capacity check is made before a new entry is stored, matching
+// findPairsInCell, so nothing is written once the slot is full.
+__kernel void kFindPairsLarge(	int numObjects, 
+								__global float4* pAABB, 
+								__global int2* pHash, 
+								__global int* pCellStart, 
+								__global int* pPairBuff, 
+								__global int2* pPairBuffStartCurr, 
+								uint numLarge GUID_ARG)
+{
+	int index = get_global_id(0);
+	if(index >= numObjects)
+	{
+		return;
+	}
+	int2 sortedData = pHash[index];
+	int unsorted_indx = sortedData.y;
+	float4 min0 = pAABB[unsorted_indx*2 + 0];
+	float4 max0 = pAABB[unsorted_indx*2 + 1];
+	int handleIndex =  as_int(min0.w);
+	int2 start_curr = pPairBuffStartCurr[handleIndex];
+	int start = start_curr.x;
+	int curr = start_curr.y;
+	// the slot ends where the slot of the next handle begins
+	int2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	int curr_max = start_curr_next.x - start - 1;
+	for(uint i = 0; i < numLarge; i++)
+	{
+		int indx2 = numObjects + i;
+		float4 min1 = pAABB[indx2*2 + 0];
+		float4 max1 = pAABB[indx2*2 + 1];
+		if(testAABBOverlap(min0, max0, min1, max1))
+		{
+			int k;
+			int handleIndex2 =  as_int(min1.w);
+			// look for this partner among the entries already in the slot
+			for(k = 0; k < curr; k++)
+			{
+				int old_pair = pPairBuff[start+k] & (~0x60000000);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= 0x40000000;
+					break;
+				}
+			}
+			// k == curr means the partner was not found: append it
+			if(k == curr)
+			{
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | 0x20000000;
+				curr++;
+			}
+		}
+	}
+	// write the updated entry count back; the slot start is unchanged
+	int2 newStartCurr;
+	newStartCurr.x = start;
+	newStartCurr.y = curr;
+	pPairBuffStartCurr[handleIndex] = newStartCurr;
+	return;
+}
+
+// Kernel: one work-item per body. Counts the entries in the pair slot of the
+// body that do not carry the 0x40000000 flag and stores the count at
+// pPairScan[index+1]. Element 0 of pPairScan is not written here.
+//   numObjects         : number of bodies.
+//   pPairBuff          : pair storage, one contiguous slot per handle.
+//   pPairBuffStartCurr : int2 per handle, x = slot start, y = entries in use.
+//   pPairScan          : output counts, shifted by one element.
+//   pAABB              : two float4 per body; the w of the minimum corner
+//                        carries the handle index as raw integer bits.
+__kernel void kComputePairCacheChanges(	int numObjects,
+										__global int* pPairBuff, 
+										__global int2* pPairBuffStartCurr, 
+										__global int* pPairScan, 
+										__global float4* pAABB GUID_ARG)
+{
+	int bodyIdx = get_global_id(0);
+	if(bodyIdx >= numObjects)
+	{
+		return;
+	}
+	int handle = as_int(pAABB[bodyIdx * 2].w);
+	int2 slot = pPairBuffStartCurr[handle];
+	int changed = 0;
+	for(int k = 0; k < slot.y; k++)
+	{
+		if((pPairBuff[slot.x + k] & 0x40000000) == 0)
+		{
+			changed++;
+		}
+	}
+	pPairScan[bodyIdx+1] = changed;
+}
+
+// Kernel: one work-item per body. Makes a single pass over the pair slot of
+// the body and does two things with every entry:
+//   - entries without the 0x40000000 flag are copied, flags included, to
+//     pPairOut starting at offset pPairScan[index+1];
+//   - entries carrying 0x20000000 or 0x40000000 are kept: they are written
+//     back to the front of the slot with both flags cleared.
+// Entries with neither flag are therefore reported but dropped from the slot.
+// The entry count of the slot is updated to the number of kept entries.
+// The write-back position never passes the read position, so compacting in
+// place is safe. pPairOut and pPairBuff are taken to be distinct buffers,
+// as in the host code that binds them.
+__kernel void kSqueezeOverlappingPairBuff(	int numObjects,
+											__global int* pPairBuff, 
+											__global int2* pPairBuffStartCurr, 
+											__global int* pPairScan,
+											__global int* pPairOut, 
+											__global float4* pAABB GUID_ARG)
+{
+	int bodyIdx = get_global_id(0);
+	if(bodyIdx >= numObjects)
+	{
+		return;
+	}
+	int handle = as_int(pAABB[bodyIdx * 2].w);
+	int2 slot = pPairBuffStartCurr[handle];
+	int first = slot.x;
+	int count = slot.y;
+	int outPos = pPairScan[bodyIdx+1];
+	int kept = 0;
+	for(int k = 0; k < count; k++)
+	{
+		int entry = pPairBuff[first + k];
+		if(!(entry & 0x40000000))
+		{
+			pPairOut[outPos] = entry;
+			outPos++;
+		}
+		if(entry & 0x60000000)
+		{
+			pPairBuff[first + kept] = entry & (~0x60000000);
+			kept++;
+		}
+	}
+	int2 updated;
+	updated.x = first;
+	updated.y = kept;
+	pPairBuffStartCurr[handle] = updated;
+}
+
+
+
+
+);
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.cpp
@@ -0,0 +1,697 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+
+#include "bt3dGridBroadphaseOCL.h"
+
+#include <stdio.h>
+#include <string.h>
+#include "Adl/Adl.h"
+#include <AdlPrimitives/Scan/PrefixScan.h>
+#include <AdlPrimitives/Sort/RadixSort32.h>
+#include <AdlPrimitives/Sort/RadixSort.h>
+
+// When defined, the kernel wrappers in this file call clFinish() after
+// enqueueing work so each stage completes before the next one starts.
+#define ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+
+// File name handed to btOpenCLUtils::compileCLProgramFromString next to the
+// in-memory source. NOTE(review): presumably used for binary caching; the
+// path is relative and uses Windows separators - confirm on other platforms.
+#define GRID_OCL_PATH "..\\..\\opencl\\3dGridBroadphase\\Shared\\bt3dGridBroadphaseOCL.cl"
+
+
+// Turns its argument into a string literal. The .cl file wraps all of its
+// code in MSTRINGIFY( ... ); so including it below yields the kernel source
+// as one string and also supplies the terminating semicolon.
+#define MSTRINGIFY(A) #A
+
+static const char* spProgramSource = 
+#include "bt3dGridBroadphaseOCL.cl"
+
+// Adl helper objects. They are file-scope (not per instance): the
+// constructor assigns them and the destructor releases them, so two live
+// bt3dGridBroadphaseOCL objects would share and overwrite them.
+adl::PrefixScan<adl::TYPE_CL>::Data* gData1=0;
+// Despite the m_ prefix this is a global, not a class member.
+adl::Buffer<unsigned int>* m_srcClBuffer=0;
+
+// Not referenced in the visible part of this file.
+struct MySortData
+{
+	int key;
+	int value;
+};
+
+adl::RadixSort32<adl::TYPE_CL>::Data* dataC = 0;
+adl::RadixSort<adl::TYPE_HOST>::Data* dataHost = 0;
+
+
+// Values the constructor writes into m_srcClBuffer: zeroEl at elements 0 and
+// maxSmallProxies+1, infElem at element maxSmallProxies.
+static unsigned int infElem = 0x2fffffff;
+
+static unsigned int zeroEl = 0;
+// Not referenced in the visible part of this file.
+static unsigned int minusOne= -1;
+
+
+/// Builds the OpenCL-backed 3D grid broadphase.
+/// The grid and proxy limits are forwarded unchanged to btGpu3DGridBroadphase.
+/// @param context, device, queue  OpenCL objects used for all buffers and kernels.
+/// @param deviceCL  Adl device to use; when null, a device wrapping the given
+///                  context/device/queue is created here and owned by this object.
+/// Also allocates the file-scope Adl scan/sort helpers used by sortHash() and
+/// scanOverlappingPairBuff().
+bt3dGridBroadphaseOCL::bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
+												const btVector3& cellSize, 
+												int gridSizeX, int gridSizeY, int gridSizeZ, 
+												int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+												btScalar maxSmallProxySize,
+												int maxSmallProxiesPerCell,
+												cl_context context, cl_device_id device, cl_command_queue queue,
+												adl::DeviceCL* deviceCL
+												) : 
+	btGpu3DGridBroadphase(overlappingPairCache, cellSize, gridSizeX, gridSizeY, gridSizeZ, maxSmallProxies, maxLargeProxies, maxPairsPerSmallProxy, maxSmallProxySize, maxSmallProxiesPerCell)
+{
+	// OpenCL side first: program, device buffers, their initial contents,
+	// then the kernels (whose arguments reference the buffers)
+	initCL(context, device, queue);
+	allocateBuffers();
+	prefillBuffers();
+	initKernels();
+
+	// Adl host device
+	adl::DeviceUtils::Config cfg;
+	m_deviceHost = adl::DeviceUtils::allocate( adl::TYPE_HOST, cfg );
+
+	// Adl OpenCL device: borrow the one passed in, or wrap the raw handles
+	m_ownsDevice = (deviceCL == 0);
+	if (m_ownsDevice)
+	{
+		deviceCL = new adl::DeviceCL;
+		deviceCL->m_context = context;
+		deviceCL->m_deviceIdx = device;
+		deviceCL->m_commandQueue = queue;
+		deviceCL->m_kernelManager = new adl::KernelManager;
+	}
+	m_deviceCL = deviceCL;
+
+	// the device-side scan and sort helpers get at least 256K elements
+	const int minSortCapacity = 256*1024;
+	const int sortCapacity = (maxSmallProxies < minSortCapacity) ? minSortCapacity : maxSmallProxies;
+
+	// scan input buffer: element 0 and the last element are zero,
+	// element maxSmallProxies holds infElem
+	m_srcClBuffer = new adl::Buffer<unsigned int> (m_deviceCL,maxSmallProxies+2);
+	m_srcClBuffer->write(&zeroEl,1,0);
+	m_srcClBuffer->write(&infElem,1,maxSmallProxies);
+	m_srcClBuffer->write(&zeroEl,1,maxSmallProxies+1);
+	m_deviceCL->waitForCompletion();
+
+	gData1 = adl::PrefixScan<adl::TYPE_CL>::allocate( m_deviceCL, sortCapacity+2,adl::PrefixScanBase::EXCLUSIVE );
+	dataHost = adl::RadixSort<adl::TYPE_HOST>::allocate( m_deviceHost, maxSmallProxies+2 );
+	dataC = adl::RadixSort32<adl::TYPE_CL>::allocate( m_deviceCL, sortCapacity+2 );
+}
+
+
+
+/// Releases the Adl helpers created by the constructor and, when this object
+/// created the Adl OpenCL device itself, that device and its kernel manager.
+/// The file-scope helper pointers are reset to 0 after release so they do
+/// not dangle once this object is gone.
+bt3dGridBroadphaseOCL::~bt3dGridBroadphaseOCL()
+{
+	//btSimpleBroadphase will free memory of btSortedOverlappingPairCache, because m_ownsPairCache
+	assert(m_bInitialized);
+	adl::RadixSort<adl::TYPE_HOST>::deallocate(dataHost);
+	dataHost = 0;
+	adl::PrefixScan<adl::TYPE_CL>::deallocate(gData1);
+	gData1 = 0;
+	adl::RadixSort32<adl::TYPE_CL>::deallocate(dataC);
+	dataC = 0;
+	adl::DeviceUtils::deallocate(m_deviceHost);
+	m_deviceHost = 0;
+	// the buffer lives on m_deviceCL, so it must go before the device does
+	delete m_srcClBuffer;
+	m_srcClBuffer = 0;
+	if (m_ownsDevice)
+	{
+		delete m_deviceCL->m_kernelManager;
+		delete m_deviceCL;
+	}
+}
+
+#ifdef CL_PLATFORM_MINI_CL
+// Workaround for MSVC9: static constructors are not run for variables that
+// are defined in a library but never referenced, apparently because the
+// optimizer drops them. Other compilers may behave the same way.
+// To be robust, the kernels are declared here and registered again in
+// initCL(); registering twice is safe.
+#define MINICL_DECLARE(a) extern "C" void a();
+MINICL_DECLARE(kCalcHashAABB)
+MINICL_DECLARE(kClearCellStart)
+MINICL_DECLARE(kFindCellStart)
+MINICL_DECLARE(kFindOverlappingPairs)
+MINICL_DECLARE(kFindPairsLarge)
+MINICL_DECLARE(kComputePairCacheChanges)
+MINICL_DECLARE(kSqueezeOverlappingPairBuff)
+#undef MINICL_DECLARE
+#endif
+
+/// Stores the OpenCL context, device and queue and compiles the kernel source.
+/// @param context  OpenCL context; must be non-null.
+/// @param device   OpenCL device; must be non-null.
+/// @param queue    command queue used for all transfers and kernel launches.
+/// The error code is initialised to CL_SUCCESS, so the check after
+/// compilation is well defined even if the helper leaves it untouched on
+/// success. "OK" is only printed when a program object was obtained.
+void bt3dGridBroadphaseOCL::initCL(cl_context context, cl_device_id device, cl_command_queue queue)
+{
+
+	#ifdef CL_PLATFORM_MINI_CL
+		// call constructors here
+		MINICL_REGISTER(kCalcHashAABB)
+		MINICL_REGISTER(kClearCellStart)
+		MINICL_REGISTER(kFindCellStart)
+		MINICL_REGISTER(kFindOverlappingPairs)
+		MINICL_REGISTER(kFindPairsLarge)
+		MINICL_REGISTER(kComputePairCacheChanges)
+		MINICL_REGISTER(kSqueezeOverlappingPairBuff)
+	#endif
+
+	cl_int ciErrNum = CL_SUCCESS;
+
+	btAssert(context);
+	m_cxMainContext = context;
+	btAssert(device);
+	m_cdDevice = device;
+	btAssert(queue);
+	m_cqCommandQue = queue;
+	
+	// the adjacent empty literals concatenate to "-DGUID_ARG=", which defines
+	// GUID_ARG as empty for the kernel signatures
+	m_cpProgram = btOpenCLUtils::compileCLProgramFromString(m_cxMainContext,m_cdDevice,spProgramSource, &ciErrNum,"-DGUID_ARG=""""",GRID_OCL_PATH);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+	btAssert(m_cpProgram);
+	
+	if (m_cpProgram)
+	{
+		printf("OK\n");
+	}
+}
+
+
+/// Creates the seven kernels and binds their buffer arguments.
+/// Argument 0 of every kernel is left unset here; it is supplied on each
+/// launch by runKernelWithWorkgroupSize(). Argument 6 of kFindPairsLarge is
+/// set by findPairsLarge().
+void bt3dGridBroadphaseOCL::initKernels()
+{
+	// every argument bound here is a cl_mem handle
+	const int memArgSize = sizeof(cl_mem);
+	int id;
+
+	id = GRID3DOCL_KERNEL_CALC_HASH_AABB;
+	initKernel(id, "kCalcHashAABB");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dAABB);
+	setKernelArg(id, 2, memArgSize, (void*)&m_dBodiesHash);
+	setKernelArg(id, 3, memArgSize, (void*)&m_dBpParams);
+
+	id = GRID3DOCL_KERNEL_CLEAR_CELL_START;
+	initKernel(id, "kClearCellStart");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dCellStart);
+
+	id = GRID3DOCL_KERNEL_FIND_CELL_START;
+	initKernel(id, "kFindCellStart");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dBodiesHash);
+	setKernelArg(id, 2, memArgSize, (void*)&m_dCellStart);
+
+	id = GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS;
+	initKernel(id, "kFindOverlappingPairs");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dAABB);
+	setKernelArg(id, 2, memArgSize, (void*)&m_dBodiesHash);
+	setKernelArg(id, 3, memArgSize, (void*)&m_dCellStart);
+	setKernelArg(id, 4, memArgSize, (void*)&m_dPairBuff);
+	setKernelArg(id, 5, memArgSize, (void*)&m_dPairBuffStartCurr);
+	setKernelArg(id, 6, memArgSize, (void*)&m_dBpParams);
+
+	id = GRID3DOCL_KERNEL_FIND_PAIRS_LARGE;
+	initKernel(id, "kFindPairsLarge");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dAABB);
+	setKernelArg(id, 2, memArgSize, (void*)&m_dBodiesHash);
+	setKernelArg(id, 3, memArgSize, (void*)&m_dCellStart);
+	setKernelArg(id, 4, memArgSize, (void*)&m_dPairBuff);
+	setKernelArg(id, 5, memArgSize, (void*)&m_dPairBuffStartCurr);
+
+	id = GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES;
+	initKernel(id, "kComputePairCacheChanges");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dPairBuff);
+	setKernelArg(id, 2, memArgSize, (void*)&m_dPairBuffStartCurr);
+	setKernelArg(id, 3, memArgSize, (void*)&m_dPairScanChanged);
+	setKernelArg(id, 4, memArgSize, (void*)&m_dAABB);
+
+	id = GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF;
+	initKernel(id, "kSqueezeOverlappingPairBuff");
+	setKernelArg(id, 1, memArgSize, (void*)&m_dPairBuff);
+	setKernelArg(id, 2, memArgSize, (void*)&m_dPairBuffStartCurr);
+	setKernelArg(id, 3, memArgSize, (void*)&m_dPairScanChanged);
+	setKernelArg(id, 4, memArgSize, (void*)&m_dPairsChanged);
+	setKernelArg(id, 5, memArgSize, (void*)&m_dAABB);
+}
+
+
+/// Creates all device buffers used by the kernels. Sizes derive from the
+/// limits held by the base class (m_maxHandles, m_maxLargeHandles,
+/// m_maxPairsPerBody, m_numCells). Every creation is checked with
+/// GRID3DOCL_CHECKERROR.
+void bt3dGridBroadphaseOCL::allocateBuffers()
+{
+    cl_int ciErrNum;
+    unsigned int memSize;
+	// m_hashSize is m_maxHandles rounded up to a power of two; prefillBuffers()
+	// pads the hash buffer up to this size
+	m_hashSize = 1;
+	for(int bit = 1; bit < 32; bit++)
+	{
+		if(m_hashSize >= m_maxHandles)
+		{
+			break;
+		}
+		m_hashSize <<= 1;
+	}
+	// (hash, index) pairs; never smaller than 1 MB
+	memSize = m_hashSize * 2 * sizeof(unsigned int);
+	if (memSize < 1024*1024)
+		memSize = 1024*1024;
+
+	m_dBodiesHash = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = m_numCells * sizeof(unsigned int);
+	m_dCellStart = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+	m_dPairBuff = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// The kernels address this buffer as int2 and load element
+	// [handleIndex+1] to find the end of a slot. For the last handle that is
+	// element m_maxHandles, so m_maxHandles+1 complete int2 elements are
+	// needed; 2*m_maxHandles+1 uints would leave that load 4 bytes short.
+	memSize = (m_maxHandles + 1) * 2 * sizeof(unsigned int);
+	m_dPairBuffStartCurr = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// two float4 (min, max) per small and per large body
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	memSize = numAABB * sizeof(float) * 4 * 2;
+	m_dAABB = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = (m_maxHandles + 2) * sizeof(unsigned int);
+	m_dPairScanChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	memSize = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+	m_dPairsChanged = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// same size as m_dPairsChanged
+	m_dPairsContiguous = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	// room for three float4; setParameters() uploads two of them
+	memSize = 3 * 4 * sizeof(float);
+	m_dBpParams = clCreateBuffer(m_cxMainContext, CL_MEM_READ_WRITE, memSize, NULL, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+/// Uploads the initial contents of the hash and pair buffers:
+///  - the host hash array is filled with 0xFF bytes and copied to the device,
+///    and the same bytes pad the device buffer from m_maxHandles entries up
+///    to m_hashSize entries;
+///  - the host pair start/count table is uploaded as it is;
+///  - the host pair buffer is zeroed and uploaded.
+void bt3dGridBroadphaseOCL::prefillBuffers()
+{
+	const size_t hashBytes = m_maxHandles * 2 * sizeof(unsigned int);
+	memset(m_hBodiesHash, 0xFF, hashBytes);
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, hashBytes);
+
+	// pad the tail of the device hash buffer, starting right after the
+	// m_maxHandles entries written above
+	int padEntries = m_hashSize - m_maxHandles;
+	if(padEntries != 0)
+	{
+		copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, padEntries * 2 * sizeof(unsigned int), hashBytes, 0);
+	}
+
+	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); 
+
+	const size_t pairBytes = m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int);
+	memset(m_hPairBuff, 0x00, pairBytes);
+	copyArrayToDevice(m_dPairBuff, m_hPairBuff, pairBytes);
+}
+
+
+/// Creates one kernel from the compiled program and records it in m_kernels.
+/// @param kernelId  slot in m_kernels (one of the GRID3DOCL_KERNEL_* values).
+/// @param pName     kernel function name; the pointer itself is stored, so
+///                  the string must outlive this object.
+/// The work-group size starts at 0, so a failed query leaves a defined value.
+/// runKernelWithWorkgroupSize() treats a size of 0 as "let OpenCL choose".
+void bt3dGridBroadphaseOCL::initKernel(int kernelId, char* pName)
+{
+	
+	cl_int ciErrNum;
+	cl_kernel kernel = clCreateKernel(m_cpProgram, pName, &ciErrNum);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+	size_t wgSize = 0;
+	ciErrNum = clGetKernelWorkGroupInfo(kernel, m_cdDevice, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+	m_kernels[kernelId].m_Id = kernelId;
+	m_kernels[kernelId].m_kernel = kernel;
+	m_kernels[kernelId].m_name = pName;
+	m_kernels[kernelId].m_workgroupSize = (int)wgSize;
+	return;
+}
+
+/// Launches a kernel over a one-dimensional range and flushes the queue.
+/// @param kernelId    slot in m_kernels.
+/// @param globalSize  number of elements to process; also passed to the
+///                    kernel as argument 0. Nothing happens when it is <= 0.
+/// The local size is the recorded work-group size capped at 64, and the
+/// global size is rounded up to a multiple of it. When no work-group size
+/// is recorded (<= 0), the launch uses exactly globalSize items and leaves
+/// the local size to the OpenCL implementation.
+void bt3dGridBroadphaseOCL::runKernelWithWorkgroupSize(int kernelId, int globalSize)
+{
+	if(globalSize <= 0)
+	{
+		return;
+	}
+	cl_kernel kernelFunc = m_kernels[kernelId].m_kernel;
+	cl_int ciErrNum = clSetKernelArg(kernelFunc, 0, sizeof(int), (void*)&globalSize);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+
+	const int groupSize = btMin(64,m_kernels[kernelId].m_workgroupSize);
+	size_t globalWorkSize[2];
+	size_t localWorkSize[2];
+	size_t* pLocalWorkSize = NULL;
+	globalWorkSize[1] = 1;
+	localWorkSize[1] = 1;
+	if(groupSize > 0)
+	{
+		// number of groups needed to cover globalSize, rounded up
+		int numGroups = globalSize / groupSize;
+		if(globalSize % groupSize)
+		{
+			numGroups++;
+		}
+		localWorkSize[0] = groupSize;
+		globalWorkSize[0] = numGroups * groupSize;
+		pLocalWorkSize = localWorkSize;
+	}
+	else
+	{
+		globalWorkSize[0] = globalSize;
+	}
+	ciErrNum = clEnqueueNDRangeKernel(m_cqCommandQue, kernelFunc, 1, NULL, globalWorkSize, pLocalWorkSize, 0,0,0 );
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+	ciErrNum = clFlush(m_cqCommandQue);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+
+/// Sets one argument of a kernel stored in m_kernels and checks the result.
+/// @param kernelId  slot in m_kernels.
+/// @param argNum    argument index.
+/// @param argSize   size of the argument value in bytes.
+/// @param argPtr    pointer to the argument value.
+void bt3dGridBroadphaseOCL::setKernelArg(int kernelId, int argNum, int argSize, void* argPtr)
+{
+	cl_kernel target = m_kernels[kernelId].m_kernel;
+	cl_int ciErrNum = clSetKernelArg(target, argNum, argSize, argPtr);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+
+/// Blocking copy from host memory to a device buffer.
+/// @param device    destination buffer.
+/// @param host      source memory.
+/// @param size      number of bytes; a size of 0 is a no-op.
+/// @param devOffs   byte offset into the device buffer.
+/// @param hostOffs  byte offset into the host memory.
+void bt3dGridBroadphaseOCL::copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs, int hostOffs)
+{
+	if (!size)
+	{
+		return;
+	}
+	char* src = (char*)host + hostOffs;
+	// CL_TRUE: the call returns only after the data has been copied
+	cl_int ciErrNum = clEnqueueWriteBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, src, 0, NULL, NULL);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+/// Blocking copy from a device buffer to host memory.
+/// @param host      destination memory.
+/// @param device    source buffer.
+/// @param size      number of bytes; a size of 0 is a no-op.
+/// @param hostOffs  byte offset into the host memory.
+/// @param devOffs   byte offset into the device buffer.
+void bt3dGridBroadphaseOCL::copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs, int devOffs)
+{
+	if (!size)
+	{
+		return;
+	}
+	char* dst = (char*)host + hostOffs;
+	// CL_TRUE: the call returns only after the data has arrived
+	cl_int ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, device, CL_TRUE, devOffs, size, dst, 0, NULL, NULL);
+	GRID3DOCL_CHECKERROR(ciErrNum, CL_SUCCESS);
+}
+
+
+
+//
+// overrides
+//
+
+
+/// Lets the base class fill the host AABB array, then uploads it.
+/// Two bt3DGrid3F1U records (minimum and maximum corner) are sent for every
+/// small and every large handle currently in use.
+void bt3dGridBroadphaseOCL::prepareAABB()
+{
+	btGpu3DGridBroadphase::prepareAABB();
+	const int numBoxes = m_numHandles + m_numLargeHandles;
+	copyArrayToDevice(m_dAABB, m_hAABB, sizeof(bt3DGrid3F1U) * 2 * numBoxes); 
+}
+
+/// Forwards the parameters to the base class and uploads the subset the
+/// kernels read. The device layout is four floats followed by four ints:
+/// the kernels read the first group as pParams[0] and reinterpret the
+/// second group, pParams[1], as an int4.
+///   floats: inverse cell size x, y, z and 0
+///   ints  : grid size x, y, z and the maximum number of bodies per cell
+void bt3dGridBroadphaseOCL::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	btGpu3DGridBroadphase::setParameters(hostParams);
+
+	struct btParamsBpOCL
+	{
+		float m_invCellSize[4];
+		int   m_gridSize[4];
+	};
+	btParamsBpOCL deviceParams;
+
+	float* invCell = deviceParams.m_invCellSize;
+	invCell[0] = m_params.m_invCellSizeX;
+	invCell[1] = m_params.m_invCellSizeY;
+	invCell[2] = m_params.m_invCellSizeZ;
+	invCell[3] = 0.f;
+
+	int* grid = deviceParams.m_gridSize;
+	grid[0] = m_params.m_gridSizeX;
+	grid[1] = m_params.m_gridSizeY;
+	grid[2] = m_params.m_gridSizeZ;
+	grid[3] = m_params.m_maxBodiesPerCell;
+
+	copyArrayToDevice(m_dBpParams, &deviceParams, sizeof(btParamsBpOCL));
+}
+
+
+/// Runs the kCalcHashAABB kernel over the m_numHandles small bodies.
+/// The disabled #else branch is the host fallback of the base class.
+void bt3dGridBroadphaseOCL::calcHashAABB()
+{
+	BT_PROFILE("calcHashAABB");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CALC_HASH_AABB, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	// wait for the kernel so the profile zone covers its execution
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+
+#else
+	btGpu3DGridBroadphase::calcHashAABB();
+#endif
+	
+	return;
+}
+
+
+/// Sorts the (hash, body index) pairs of the hash buffer.
+/// Three code paths are selected at compile time:
+///  - CL_PLATFORM_MINI_CL: sort on the host through the base class, then
+///    upload the host array;
+///  - USE_HOST (left undefined here): download, sort with the Adl host radix
+///    sort, upload again;
+///  - default: sort in place on the device with adl::RadixSort32.
+void bt3dGridBroadphaseOCL::sortHash()
+{
+	BT_PROFILE("sortHash");
+#ifdef CL_PLATFORM_MINI_CL
+	//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+	btGpu3DGridBroadphase::sortHash();
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#else
+	
+//#define USE_HOST
+#ifdef USE_HOST
+	copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+	//adl::Buffer<unsigned int> keysIn,keysOut,valuesIn,valuesOut;
+	///adl::RadixSort32<adl::TYPE_CL>::execute(dataC,keysIn,keysOut,valuesIn,valuesOut,m_numHandles);
+	adl::HostBuffer<adl::SortData> inoutHost;
+	inoutHost.m_device = m_deviceHost;
+	inoutHost.m_ptr = (adl::SortData*)m_hBodiesHash;
+	inoutHost.m_size = m_numHandles;
+	adl::RadixSort<adl::TYPE_HOST>::execute(dataHost, inoutHost,m_numHandles);
+	copyArrayToDevice(m_dBodiesHash, m_hBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#else
+	{
+	clFinish(m_cqCommandQue);
+	BT_PROFILE("RadixSort32::execute");
+	// wrap the existing cl_mem in an Adl buffer; nothing is allocated here
+	adl::Buffer<adl::SortData> inout;
+	inout.m_device = this->m_deviceCL;
+	inout.m_size = m_numHandles;
+	inout.m_ptr = (adl::SortData*)m_dBodiesHash;
+	int actualHandles = m_numHandles;
+	int dataAlignment = adl::RadixSort32<adl::TYPE_CL>::DATA_ALIGNMENT;
+
+	// round the element count up to a multiple of DATA_ALIGNMENT, so the
+	// sort also covers entries past m_numHandles
+	// NOTE(review): this presumably relies on the 0xFF padding written by
+	// prefillBuffers() sorting to the end - confirm
+	if (actualHandles%dataAlignment)
+	{
+		actualHandles += dataAlignment-(actualHandles%dataAlignment);
+	}
+
+	adl::RadixSort32<adl::TYPE_CL>::execute(dataC,inout, actualHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	}
+	{
+		//BT_PROFILE("copyArrayFromDevice");
+	//copyArrayFromDevice(m_hBodiesHash, m_dBodiesHash, m_numHandles * 2 * sizeof(unsigned int));
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif //ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	}
+
+
+#endif //USE_HOST
+#endif
+
+	return;
+}
+
+
+
+/// Fills the cell-start table. On MiniCL the base class computes it on the
+/// host and the result is uploaded; otherwise kClearCellStart runs over all
+/// m_numCells cells, followed by kFindCellStart over the m_numHandles bodies.
+void bt3dGridBroadphaseOCL::findCellStart()
+{
+#if 1
+	BT_PROFILE("findCellStart");
+		
+	#if defined(CL_PLATFORM_MINI_CL)
+		btGpu3DGridBroadphase::findCellStart();
+		copyArrayToDevice(m_dCellStart, m_hCellStart, m_numCells * sizeof(unsigned int));
+	#else
+			// every cell is reset before the starts are written
+			runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_CLEAR_CELL_START, m_numCells);	
+			runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_CELL_START, m_numHandles);
+	#endif
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findCellStart();
+#endif
+
+	return;
+}
+
+
+
+/// Runs the kFindOverlappingPairs kernel over the m_numHandles small bodies.
+/// The disabled #else branch computes the pairs on the host through the base
+/// class and uploads the resulting pair buffers.
+void bt3dGridBroadphaseOCL::findOverlappingPairs()
+{
+#if 1
+	BT_PROFILE("findOverlappingPairs");
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findOverlappingPairs();
+	copyArrayToDevice(m_dPairBuffStartCurr, m_hPairBuffStartCurr, (m_maxHandles * 2 + 1) * sizeof(unsigned int)); 
+	copyArrayToDevice(m_dPairBuff, m_hPairBuff, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int));
+#endif
+	return;
+}
+
+
+/// Tests every small body against the large bodies with the kFindPairsLarge
+/// kernel. The kernel is only launched when at least one large handle
+/// exists; the clFinish below runs either way.
+void bt3dGridBroadphaseOCL::findPairsLarge()
+{
+	BT_PROFILE("findPairsLarge");
+#if 1
+	if(m_numLargeHandles)
+	{
+		// argument 6 is the number of large bodies and can change between
+		// calls, so it is set here rather than in initKernels()
+		setKernelArg(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, 6, sizeof(int),(void*)&m_numLargeHandles);
+		runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_FIND_PAIRS_LARGE, m_numHandles);
+	}
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::findPairsLarge();
+#endif
+	return;
+}
+
+
+
+/// Runs the kComputePairCacheChanges kernel over the m_numHandles small
+/// bodies and downloads the per-body counts (m_numHandles + 2 values) into
+/// m_hPairScanChanged. The disabled #else branch is the host fallback.
+void bt3dGridBroadphaseOCL::computePairCacheChanges()
+{
+	BT_PROFILE("computePairCacheChanges");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES, m_numHandles);
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+	// blocking read, so the counts are valid on return
+	copyArrayFromDevice( m_hPairScanChanged,m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+
+#else
+	btGpu3DGridBroadphase::computePairCacheChanges();
+	copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+	
+
+#endif
+	return;
+}
+
+
+
+
+extern cl_device_type deviceType;
+
+/// Runs a device-side prefix scan (allocated as EXCLUSIVE in the constructor)
+/// over the per-body change counts and stores the total in m_numPrefixSum.
+/// @param copyToCpu  when true, the scanned array (m_numHandles + 2 values)
+///                   is also downloaded into m_hPairScanChanged.
+/// The first, disabled #if 0 branch is a host fallback using the base class.
+void bt3dGridBroadphaseOCL::scanOverlappingPairBuff(bool copyToCpu)
+{
+
+	//Intel/CPU version doesn't handle Adl scan well
+#if 0
+	{
+		copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+		btGpu3DGridBroadphase::scanOverlappingPairBuff();
+		copyArrayToDevice(m_dPairScanChanged, m_hPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+		m_numPrefixSum = m_hPairScanChanged[m_numHandles+1];
+		clFinish(m_cqCommandQue);
+		//memset(m_hPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
+	}
+#else
+	{
+
+	//	copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2)); 
+	//	btGpu3DGridBroadphase::scanOverlappingPairBuff();
+
+		// wraps m_dPairScanChanged; it is the scan output below
+		adl::Buffer<unsigned int> destBuffer;
+		
+		{
+			BT_PROFILE("copy GPU->GPU");
+		
+			destBuffer.m_ptr = (unsigned int*)m_dPairScanChanged;
+			destBuffer.m_device = m_deviceCL;
+			// NOTE(review): m_size is set to a byte count here, while
+			// sortHash() sets it to an element count - confirm which one
+			// adl::Buffer expects
+			destBuffer.m_size =  sizeof(unsigned int)*(m_numHandles+2);
+			// NOTE(review): presumably copies m_numHandles counts, starting
+			// at element 1, from the pair-scan buffer into m_srcClBuffer
+			// (destination first) - confirm against adl::DeviceCL::copy
+			m_deviceCL->copy(m_srcClBuffer, &destBuffer,m_numHandles,1,1);
+
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+		}
+
+		{
+			BT_PROFILE("PrefixScan");
+			
+			// scan m_srcClBuffer into the pair-scan buffer; the sum is
+			// returned through m_numPrefixSum
+			adl::PrefixScan<adl::TYPE_CL>::execute(gData1,*m_srcClBuffer,destBuffer, m_numHandles+2,&m_numPrefixSum);
+			
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+		//if (m_numPrefixSum>0x1000)
+		//	{
+		//		printf("error m_numPrefixSum==%d\n",m_numPrefixSum);
+		//	}
+
+		}
+
+#if 0
+		unsigned int* verifyhPairScanChanged = new unsigned int[m_maxHandles + 2];
+		memset(verifyhPairScanChanged,0,sizeof(int)*m_maxHandles + 2);
+
+		copyArrayFromDevice(verifyhPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
+		clFinish(m_cqCommandQue);
+
+		/*for (int i=0;i<m_numHandles+2;i++)
+		{
+			if (verifyhPairScanChanged[i] != m_hPairScanChanged[i])
+			{
+				printf("hello!\n");
+			}
+		}
+		*/
+
+#endif
+
+
+		if (1)
+		{
+			
+			// optional download of the scanned data
+			if (copyToCpu)
+			{
+				BT_PROFILE("copy GPU -> CPU");
+				copyArrayFromDevice(m_hPairScanChanged, m_dPairScanChanged, sizeof(unsigned int)*(m_numHandles + 2));
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+			}
+
+		}
+
+	}
+#endif
+
+	
+}
+
+
+
+/// Runs the kSqueezeOverlappingPairBuff kernel over the m_numHandles small
+/// bodies. The changed pairs stay in m_dPairsChanged on the device; the
+/// download to the host is commented out. The disabled #else branch is the
+/// host fallback of the base class.
+void bt3dGridBroadphaseOCL::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("btCuda_squeezeOverlappingPairBuff");
+#if 1
+	runKernelWithWorkgroupSize(GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF, m_numHandles);
+//	btCuda_squeezeOverlappingPairBuff(m_dPairBuff, m_dPairBuffStartCurr, m_dPairScanChanged, m_dPairsChanged, m_dAABB, m_numHandles);
+	
+	//copyArrayFromDevice(m_hPairsChanged, m_dPairsChanged, sizeof(unsigned int) * m_numPrefixSum);//m_hPairScanChanged[m_numHandles+1]); //gSum
+#ifdef ADD_BLOCKING_CL_FINISH_FOR_BENCHMARK
+	clFinish(m_cqCommandQue);
+#endif
+
+#else
+	btGpu3DGridBroadphase::squeezeOverlappingPairBuff();
+#endif
+	return;
+}
+
+
+
+/// Resets the broadphase: the base class resets its host-side state first,
+/// then the device hash and pair buffers are re-uploaded from it.
+void bt3dGridBroadphaseOCL::resetPool(btDispatcher* dispatcher)
+{
+	// host state first, the upload below reads it
+	btGpu3DGridBroadphase::resetPool(dispatcher);
+	this->prefillBuffers();
+}
+
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/bt3dGridBroadphaseOCL.h
@@ -0,0 +1,146 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006 - 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+#ifndef BT3DGRIDBROADPHASEOCL_H
+#define BT3DGRIDBROADPHASEOCL_H
+
+#ifdef __APPLE__
+#ifdef USE_MINICL
+	#include <MiniCL/cl.h>
+#else
+	#include <MiniCL/cl.h>
+#endif
+//CL_PLATFORM_MINI_CL could be defined in build system
+#else
+//#include <GL/glew.h>
+// standard utility and system includes
+#ifdef USE_MINICL
+	#include <MiniCL/cl.h>
+#else
+	#include <CL/cl.h>
+#endif
+// Extra CL/GL include
+//#include <CL/cl_gl.h>
+#endif //__APPLE__
+
+namespace adl
+{
+	struct Device;
+	struct DeviceCL;
+};
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+#include "btGpu3DGridBroadphase.h"
+
+
+#define GRID3DOCL_CHECKERROR(a, b) if((a)!=(b)) { printf("3D GRID OCL Error : %d\n", (a)); btAssert((a) == (b)); }
+
+/// Identifiers of the OpenCL kernels used by bt3dGridBroadphaseOCL.
+/// The values are used as indices into bt3dGridBroadphaseOCL::m_kernels,
+/// so GRID3DOCL_KERNEL_TOTAL must stay last: it is the array size.
+enum
+{
+	GRID3DOCL_KERNEL_CALC_HASH_AABB = 0,
+	GRID3DOCL_KERNEL_CLEAR_CELL_START,
+	GRID3DOCL_KERNEL_FIND_CELL_START,
+	GRID3DOCL_KERNEL_FIND_OVERLAPPING_PAIRS,
+	GRID3DOCL_KERNEL_FIND_PAIRS_LARGE,
+	GRID3DOCL_KERNEL_COMPUTE_CACHE_CHANGES,
+	GRID3DOCL_KERNEL_SQUEEZE_PAIR_BUFF,
+	GRID3DOCL_KERNEL_TOTAL	// number of kernels, not a kernel id
+};
+
+/// Per-kernel bookkeeping record; bt3dGridBroadphaseOCL keeps one per GRID3DOCL_KERNEL_* id.
+struct bt3dGridOCLKernelInfo
+{
+	int			m_Id;				// presumably one of the GRID3DOCL_KERNEL_* values - confirm in initKernel()
+	cl_kernel	m_kernel;			// OpenCL kernel object
+	char*		m_name;				// kernel name string; NOTE(review): ownership is not visible here
+	int			m_workgroupSize;	// presumably the local work size used when launching - confirm
+};
+
+
+///The bt3dGridBroadphaseOCL uses OpenCL-capable GPU to compute overlapping pairs
+///It derives from btGpu3DGridBroadphase and overrides the virtual pipeline steps declared
+///there (setParameters() through squeezeOverlappingPairBuff(), plus resetPool()).
+
+class bt3dGridBroadphaseOCL : public btGpu3DGridBroadphase
+{
+protected:
+	int						m_hashSize;
+	// OpenCL objects
+	cl_context				m_cxMainContext;
+	cl_device_id			m_cdDevice;
+	cl_command_queue		m_cqCommandQue;
+	cl_program				m_cpProgram;
+	// one entry per GRID3DOCL_KERNEL_* id
+	bt3dGridOCLKernelInfo	m_kernels[GRID3DOCL_KERNEL_TOTAL];
+	// data buffers
+	// NOTE(review): the m_d* buffers presumably mirror the m_h* host arrays of the base class - confirm
+	cl_mem					m_dBodiesHash;
+	cl_mem					m_dCellStart;
+	cl_mem					m_dPairBuff; 
+	cl_mem					m_dPairBuffStartCurr;
+public:
+	// the AABB buffer is the only device buffer exposed publicly
+	cl_mem					m_dAABB;
+protected:
+	cl_mem					m_dPairScanChanged;
+	cl_mem					m_dPairsChanged;
+	cl_mem					m_dPairsContiguous;
+	cl_mem					m_dBpParams;
+
+	adl::Device*			m_deviceHost;
+	adl::DeviceCL*			m_deviceCL;
+	bool					m_ownsDevice;
+
+
+public:
+	// written by the prefix scan in scanOverlappingPairBuff()
+	unsigned int			m_numPrefixSum;
+
+	// The first parameters are forwarded unchanged in meaning from btGpu3DGridBroadphase;
+	// context/device/queue/deviceCL optionally supply existing OpenCL objects.
+	// NOTE(review): behaviour when they are left at NULL/0 is defined in the .cpp - confirm there.
+	bt3dGridBroadphaseOCL(	btOverlappingPairCache* overlappingPairCache,
+							const btVector3& cellSize, 
+							int gridSizeX, int gridSizeY, int gridSizeZ, 
+							int maxSmallProxies, int maxLargeProxies, int maxPairsPerSmallProxy,
+							btScalar maxSmallProxySize,
+							int maxSmallProxiesPerCell = 8,
+							cl_context context = NULL,
+							cl_device_id device = NULL,
+							cl_command_queue queue = NULL,
+							adl::DeviceCL* deviceCL = 0
+							);
+	virtual ~bt3dGridBroadphaseOCL();
+
+protected:
+	// OpenCL setup helpers
+	void initCL(cl_context context, cl_device_id device, cl_command_queue queue);
+	void initKernels();
+	void allocateBuffers();
+	void prefillBuffers();
+	void initKernel(int kernelId, char* pName);
+	void allocateArray(void** devPtr, unsigned int size);
+	void freeArray(void* devPtr);
+	// launches the kernel identified by kernelId (a GRID3DOCL_KERNEL_* value)
+	void runKernelWithWorkgroupSize(int kernelId, int globalSize);
+	void setKernelArg(int kernelId, int argNum, int argSize, void* argPtr);
+	// host <-> device transfers; NOTE(review): size and offsets are presumably in bytes - confirm
+	void copyArrayToDevice(cl_mem device, const void* host, unsigned int size, int devOffs = 0, int hostOffs = 0);
+	void copyArrayFromDevice(void* host, const cl_mem device, unsigned int size, int hostOffs = 0, int devOffs = 0);
+
+// overrides
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	// when copyToCpu is true the scan result is copied into m_hPairScanChanged
+	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
+	virtual void squeezeOverlappingPairBuff();
+	virtual void resetPool(btDispatcher* dispatcher);
+};
+
+#endif //BT3DGRIDBROADPHASEOCL_H
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.cpp
@@ -0,0 +1,626 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+///The 3 following lines include the CPU implementation of the kernels, keep them in this order.
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+#include "btGpuUtilsSharedCode.h"
+
+
+
+#include "LinearMath/btAlignedAllocator.h"
+#include "LinearMath/btQuickprof.h"
+#include "BulletCollision/BroadphaseCollision/btOverlappingPairCache.h"
+
+
+
+#include "btGpuDefines.h"
+#include "btGpuUtilsSharedDefs.h"
+
+#include "btGpu3DGridBroadphaseSharedDefs.h"
+
+#include "btGpu3DGridBroadphase.h"
+#include <string.h> //for memset
+
+
+#include <stdio.h>
+
+
+
+// File-local copy of the grid parameters, written by btGpu3DGridBroadphase::setParameters().
+// NOTE(review): presumably read by the CPU kernel code included at the end of this file - confirm.
+static bt3DGridBroadphaseParams s3DGridBroadphaseParams;
+
+
+
+// Constructs the broadphase with its own pair cache: a btHashedOverlappingPairCache is
+// placement-constructed in 16-byte aligned memory and handed to btSimpleBroadphase.
+// All remaining setup (grid parameters, host buffers, large-proxy pool) is done by _initialize().
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(	const btVector3& cellSize, 
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										btScalar maxSmallProxySize,
+										int maxBodiesPerCell) :
+	btSimpleBroadphase(maxSmallProxies,
+//				     new (btAlignedAlloc(sizeof(btSortedOverlappingPairCache),16)) btSortedOverlappingPairCache),
+				     new (btAlignedAlloc(sizeof(btHashedOverlappingPairCache),16)) btHashedOverlappingPairCache),
+	m_bInitialized(false),
+    m_numBodies(0)
+{
+	_initialize(cellSize, gridSizeX, gridSizeY, gridSizeZ, 
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxSmallProxySize, maxBodiesPerCell);
+}
+
+
+
+// Constructs the broadphase around a caller-supplied pair cache; everything else is
+// identical to the other constructor.
+// NOTE(review): _initialize() sets m_ownsPairCache = true, so the supplied cache presumably
+// ends up being released by the base class - confirm that callers expect this ownership transfer.
+btGpu3DGridBroadphase::btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
+										const btVector3& cellSize, 
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										btScalar maxSmallProxySize,
+										int maxBodiesPerCell) :
+	btSimpleBroadphase(maxSmallProxies, overlappingPairCache),
+	m_bInitialized(false),
+    m_numBodies(0)
+{
+	_initialize(cellSize, gridSizeX, gridSizeY, gridSizeZ, 
+				maxSmallProxies, maxLargeProxies, maxPairsPerBody,
+				maxSmallProxySize, maxBodiesPerCell);
+}
+
+
+
+// Releases the host buffers and the large-proxy pool via _finalize().
+btGpu3DGridBroadphase::~btGpu3DGridBroadphase()
+{
+	// the pair cache itself is not freed here; btSimpleBroadphase is expected to do that
+	// because m_ownsPairCache is set in _initialize()
+	assert(m_bInitialized);
+	_finalize();
+
+	
+}
+
+// Returns the highest power of two not exceeding val, i.e. 2^n with 2^(n+1) > val >= 2^n.
+// Bits 30 down to 1 are probed from the top; if none of them is set in val
+// (for example val == 0 or val == 1) the result is 1.
+int btGpu3DGridBroadphase::getFloorPowOfTwo(int val)
+{
+	int bit = 0x40000000;
+	int shiftsLeft = 30;
+	while((shiftsLeft > 0) && !(bit & val))
+	{
+		bit >>= 1;
+		shiftsLeft--;
+	}
+	return bit;
+}
+
+
+
+// Stores the grid parameters and allocates every host-side working buffer.
+// Called exactly once, from the constructors; must be paired with _finalize().
+//   cellSize            - size of one grid cell per axis (its inverse is stored)
+//   gridSizeX/Y/Z       - requested grid resolution, rounded down to a power of two
+//   maxSmallProxies     - only forwarded to the base class by the constructors; the buffers
+//                         here are sized from m_maxHandles
+//   maxLargeProxies     - capacity of the separate large-proxy pool (may be 0)
+//   maxPairsPerBody     - number of pair slots reserved per small proxy
+//   maxSmallProxySize   - proxies whose bounding-sphere radius exceeds half of this are "large"
+//   maxBodiesPerCell    - stored in the parameter block
+// NOTE(review): m_ownsPairCache is forced to true even when the pair cache was supplied
+// by the caller - confirm that this ownership transfer is intended.
+void btGpu3DGridBroadphase::_initialize(	const btVector3& cellSize,
+										int gridSizeX, int gridSizeY, int gridSizeZ, 
+										int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+										btScalar maxSmallProxySize,
+										int maxBodiesPerCell)
+{
+	// set various parameters
+	m_ownsPairCache = true;
+	m_params.m_gridSizeX = getFloorPowOfTwo(gridSizeX);
+	m_params.m_gridSizeY = getFloorPowOfTwo(gridSizeY);
+	m_params.m_gridSizeZ = getFloorPowOfTwo(gridSizeZ);
+	m_params.m_numCells = m_params.m_gridSizeX * m_params.m_gridSizeY * m_params.m_gridSizeZ;
+	m_numCells = m_params.m_numCells;
+	m_params.m_invCellSizeX = btScalar(1.f) / cellSize[0];
+	m_params.m_invCellSizeY = btScalar(1.f) / cellSize[1];
+	m_params.m_invCellSizeZ = btScalar(1.f) / cellSize[2];
+	m_maxRadius = maxSmallProxySize * btScalar(0.5f);
+	m_params.m_numBodies = m_numBodies;
+	m_params.m_maxBodiesPerCell = maxBodiesPerCell;
+	// keep the member in sync with the parameter block (it was left uninitialized before)
+	m_maxBodiesPerCell = maxBodiesPerCell;
+
+	m_numLargeHandles = 0;
+	m_maxLargeHandles = maxLargeProxies;
+
+	m_maxPairsPerBody = maxPairsPerBody;
+
+	m_LastLargeHandleIndex = -1;
+
+    assert(!m_bInitialized);
+	
+    // allocate host storage
+    m_hBodiesHash = new unsigned int[m_maxHandles * 2];
+    memset(m_hBodiesHash, 0x00, m_maxHandles*2*sizeof(unsigned int));
+
+    m_hCellStart = new unsigned int[m_params.m_numCells];
+    memset(m_hCellStart, 0x00, m_params.m_numCells * sizeof(unsigned int));
+
+	m_hPairBuffStartCurr = new unsigned int[m_maxHandles * 2 + 2];
+	// --------------- for now, init with m_maxPairsPerBody for each body
+	// entry 2*i is the start offset of body i in the pair buffer, entry 2*i+1 is set to 0
+	m_hPairBuffStartCurr[0] = 0;
+	m_hPairBuffStartCurr[1] = 0;
+	for(int i = 1; i <= m_maxHandles; i++) 
+	{
+		m_hPairBuffStartCurr[i * 2] = m_hPairBuffStartCurr[(i-1) * 2] + m_maxPairsPerBody;
+		m_hPairBuffStartCurr[i * 2 + 1] = 0;
+	}
+	//----------------
+	unsigned int numAABB = m_maxHandles + m_maxLargeHandles;
+	m_hAABB = new bt3DGrid3F1U[numAABB * 2]; // AABB Min & Max
+
+	m_hPairBuff = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hPairBuff, 0x00, m_maxHandles * m_maxPairsPerBody * sizeof(unsigned int)); // needed?
+
+	m_hPairScanChanged = new unsigned int[m_maxHandles + 2];
+	// clear the whole array: the byte count is sizeof(element) * (m_maxHandles + 2);
+	// the previous expression "sizeof(int)*m_maxHandles + 2" left the tail uninitialized
+	memset(m_hPairScanChanged,0,sizeof(unsigned int)*(m_maxHandles + 2));
+
+	m_hPairsChanged = new unsigned int[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hPairsChanged,0,sizeof(int)*(m_maxHandles * m_maxPairsPerBody));
+
+	m_hAllOverlappingPairs= new MyUint2[m_maxHandles * m_maxPairsPerBody];
+	memset(m_hAllOverlappingPairs,0,sizeof(MyUint2)*(m_maxHandles * m_maxPairsPerBody));
+
+
+// large proxies
+
+	// allocate handles buffer and put all handles on free list
+	m_pLargeHandlesRawPtr = btAlignedAlloc(sizeof(btSimpleBroadphaseProxy) * m_maxLargeHandles, 16);
+	m_pLargeHandles = new(m_pLargeHandlesRawPtr) btSimpleBroadphaseProxy[m_maxLargeHandles];
+	m_firstFreeLargeHandle = 0;
+	{
+		for (int i = m_firstFreeLargeHandle; i < m_maxLargeHandles; i++)
+		{
+			m_pLargeHandles[i].SetNextFree(i + 1);
+			// large proxies get unique ids above the small-proxy range, see isLargeProxy()
+			m_pLargeHandles[i].m_uniqueId = m_maxHandles+2+i;
+		}
+		// terminate the free list; skipped for an empty pool, where index -1 would be out of bounds
+		if (m_maxLargeHandles > 0)
+		{
+			m_pLargeHandles[m_maxLargeHandles - 1].SetNextFree(0);
+		}
+	}
+
+// debug data
+	m_numPairsAdded = 0;
+	m_numOverflows = 0;
+
+	
+    m_bInitialized = true;
+}
+
+
+
+// Frees everything allocated by _initialize(): the host arrays created with new[] and the
+// aligned raw block that backs the large-proxy pool. Clears m_bInitialized afterwards.
+void btGpu3DGridBroadphase::_finalize()
+{
+    assert(m_bInitialized);
+    delete [] m_hBodiesHash;
+    delete [] m_hCellStart;
+    delete [] m_hPairBuffStartCurr;
+    delete [] m_hAABB;
+	delete [] m_hPairBuff;
+	delete [] m_hPairScanChanged;
+	delete [] m_hPairsChanged;
+	delete [] m_hAllOverlappingPairs;
+	// the large handles were placement-constructed in this block, so only the raw memory is released
+	btAlignedFree(m_pLargeHandlesRawPtr);
+	m_bInitialized = false;
+}
+
+
+
+// Runs one full broadphase update. After the base-class update, the virtual pipeline steps
+// are executed in a fixed order, each inside its own profiling scope; derived classes
+// replace individual steps by overriding them. Without small proxies only the
+// large/large test is performed.
+void btGpu3DGridBroadphase::calculateOverlappingPairs(btDispatcher* dispatcher)
+{
+	btSimpleBroadphase::calculateOverlappingPairs(dispatcher);
+
+	// no small proxies: the grid pipeline is skipped entirely
+	if(m_numHandles <= 0)
+	{
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+		return;
+	}
+	// update constants
+	{
+		BT_PROFILE("setParameters");
+		setParameters(&m_params);
+	}
+
+	// prepare AABB array
+	{
+		BT_PROFILE("prepareAABB");
+		prepareAABB();
+	}
+	// calculate hash
+	{
+		BT_PROFILE("calcHashAABB");
+		calcHashAABB();
+	}
+	{
+		BT_PROFILE("sortHash");
+		// sort bodies based on hash
+		sortHash();
+	}
+	// find start of each cell
+	{
+		BT_PROFILE("findCellStart");
+		findCellStart();
+	}
+	{
+		BT_PROFILE("findOverlappingPairs");
+		// findOverlappingPairs (small/small)
+		findOverlappingPairs();
+	}
+	// findOverlappingPairs (small/large)
+	{
+		BT_PROFILE("findPairsLarge");
+		findPairsLarge();
+	}
+	// determine the pair changes, scan and compact them
+	{
+		BT_PROFILE("computePairCacheChanges");
+		computePairCacheChanges();
+	}
+	{
+		BT_PROFILE("scanOverlappingPairBuff");
+		scanOverlappingPairBuff();
+	}
+	{
+		BT_PROFILE("squeezeOverlappingPairBuff");
+		squeezeOverlappingPairBuff();
+	}
+	// add pairs to CPU cache
+	{
+		BT_PROFILE("addPairsToCache");
+		addPairsToCache(dispatcher);
+	}
+	// find and add large/large pairs to CPU cache
+	{
+		BT_PROFILE("addLarge2LargePairsToCache");
+		addLarge2LargePairsToCache(dispatcher);
+	}
+	return;
+}
+
+
+
+// Applies the per-body change lists to the pair cache.
+// For body i the entries m_hPairsChanged[scan[i+1] .. scan[i+2]) (scan = m_hPairScanChanged)
+// are walked; each entry is the index of the other proxy with the BT_3DGRID_PAIR_* flag bits
+// attached. Entries carrying BT_3DGRID_PAIR_NEW_FLG are added to the cache, all others are
+// removed. Indices at or above m_maxHandles refer to the large-proxy pool.
+// Also refreshes the m_numPairsAdded / m_numPairsRemoved counters.
+void btGpu3DGridBroadphase::addPairsToCache(btDispatcher* dispatcher)
+{
+	m_numPairsAdded = 0;
+	m_numPairsRemoved = 0;
+	for(int body = 0; body < m_numHandles; body++)
+	{
+		const unsigned int firstChange = m_hPairScanChanged[body + 1];
+		const unsigned int numChanges = m_hPairScanChanged[body + 2] - firstChange;
+		if(numChanges == 0)
+		{
+			continue;
+		}
+		const unsigned int* changes = m_hPairsChanged + firstChange;
+		// the min corner of the body's AABB record carries the proxy index in uw
+		btSimpleBroadphaseProxy* proxy0 = &m_pHandles[m_hAABB[body * 2].uw];
+		for(unsigned int k = 0; k < numChanges; k++)
+		{
+			const unsigned int entry = changes[k];
+			unsigned int otherIndex = entry & (~BT_3DGRID_PAIR_ANY_FLG);
+			btSimpleBroadphaseProxy* proxy1;
+			if(otherIndex >= (unsigned int)m_maxHandles)
+			{
+				// large proxies are numbered after the small ones
+				otherIndex -= m_maxHandles;
+				btAssert(otherIndex < (unsigned int)m_maxLargeHandles);
+				proxy1 = &m_pLargeHandles[otherIndex];
+			}
+			else
+			{
+				proxy1 = &m_pHandles[otherIndex];
+			}
+			if(entry & BT_3DGRID_PAIR_NEW_FLG)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+				m_numPairsAdded++;
+			}
+			else
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+				m_numPairsRemoved++;
+			}
+		}
+	}
+}
+
+
+
+// Creates a broadphase proxy for the given AABB.
+// Small proxies are delegated to btSimpleBroadphase::createProxy(). Proxies classified as
+// large by isLargeProxy() are placement-constructed in a slot of the large-proxy pool.
+// Returns 0 (after asserting) when the large-proxy pool is exhausted.
+btBroadphaseProxy* btGpu3DGridBroadphase::createProxy(  const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy)
+{
+	if(!isLargeProxy(aabbMin, aabbMax))
+	{
+		return btSimpleBroadphase::createProxy(aabbMin, aabbMax, shapeType, userPtr, collisionFilterGroup, collisionFilterMask, dispatcher, multiSapProxy);
+	}
+	if (m_numLargeHandles >= m_maxLargeHandles)
+	{
+		///you have to increase the cell size, so 'large' proxies become 'small' proxies (fitting a cell)
+		btAssert(0);
+		return 0; //should never happen, but don't let the game crash ;-)
+	}
+	btAssert((aabbMin[0]<= aabbMax[0]) && (aabbMin[1]<= aabbMax[1]) && (aabbMin[2]<= aabbMax[2]));
+	const int slot = allocLargeHandle();
+	return new (&m_pLargeHandles[slot])btSimpleBroadphaseProxy(aabbMin,aabbMax,shapeType,userPtr,collisionFilterGroup,collisionFilterMask,multiSapProxy);
+}
+
+
+
+// Destroys a proxy created by createProxy().
+// Small proxies are handed back to btSimpleBroadphase. A large proxy is returned to the
+// large-handle free list and then every cached pair that contains it is removed.
+void btGpu3DGridBroadphase::destroyProxy(btBroadphaseProxy* proxy, btDispatcher* dispatcher)
+{
+	if(!isLargeProxy(proxy))
+	{
+		btSimpleBroadphase::destroyProxy(proxy, dispatcher);
+		return;
+	}
+	freeLargeHandle(static_cast<btSimpleBroadphaseProxy*>(proxy));
+	m_pairCache->removeOverlappingPairsContainingProxy(proxy,dispatcher);
+}
+
+
+
+// Re-initializes m_hPairBuffStartCurr the same way _initialize() does: entry 2*i receives
+// the start offset i * m_maxPairsPerBody and entry 2*i+1 is cleared, for i = 0 .. m_maxHandles.
+// The dispatcher argument is not used by this implementation.
+void btGpu3DGridBroadphase::resetPool(btDispatcher* dispatcher)
+{
+	unsigned int* startCurr = m_hPairBuffStartCurr;
+	startCurr[0] = 0;
+	startCurr[1] = 0;
+	unsigned int offset = 0;
+	for(int body = 1; body <= m_maxHandles; ++body)
+	{
+		offset += m_maxPairsPerBody;
+		startCurr[2 * body] = offset;
+		startCurr[2 * body + 1] = 0;
+	}
+}
+
+
+
+// Classifies an AABB as "large" when the radius of its bounding sphere (half the length of
+// the box diagonal, which also covers any rotation of the box) exceeds m_maxRadius.
+bool btGpu3DGridBroadphase::isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax)
+{
+	const btScalar boundingRadius = btScalar(0.5f) * (aabbMax - aabbMin).length();
+	return (m_maxRadius < boundingRadius);
+}
+
+
+
+// Tells whether an existing proxy belongs to the large-proxy pool, based on its unique id:
+// _initialize() assigns ids starting at m_maxHandles+2 to the large handles.
+bool btGpu3DGridBroadphase::isLargeProxy(btBroadphaseProxy* proxy)
+{
+	return (proxy->getUid() >= (m_maxHandles+2));
+}
+
+
+
+// Brute-force test of every pair of large-proxy slots in [0, m_LastLargeHandleIndex].
+// Overlapping pairs that are not cached yet are added; cached pairs that no longer overlap
+// are removed. Does nothing when there are no large proxies.
+void btGpu3DGridBroadphase::addLarge2LargePairsToCache(btDispatcher* dispatcher)
+{
+	if (m_numLargeHandles <= 0)
+	{
+		return;
+	}
+	int lastVisited = -1;
+	for(int first = 0; first <= m_LastLargeHandleIndex; first++)
+	{
+		btSimpleBroadphaseProxy* proxy0 = &m_pLargeHandles[first];
+		lastVisited = first;
+		for(int second = first + 1; second <= m_LastLargeHandleIndex; second++)
+		{
+			btSimpleBroadphaseProxy* proxy1 = &m_pLargeHandles[second];
+			btAssert(proxy0 != proxy1);
+			btSimpleBroadphaseProxy* p0 = getSimpleProxyFromProxy(proxy0);
+			btSimpleBroadphaseProxy* p1 = getSimpleProxyFromProxy(proxy1);
+			const bool overlapping = aabbOverlap(p0,p1);
+			const bool cached = (m_pairCache->findPair(proxy0,proxy1) != 0);
+			if(overlapping && !cached)
+			{
+				m_pairCache->addOverlappingPair(proxy0,proxy1);
+			}
+			else if(!overlapping && cached)
+			{
+				m_pairCache->removeOverlappingPair(proxy0,proxy1,dispatcher);
+			}
+		}
+	}
+	m_LastLargeHandleIndex = lastVisited;
+}
+
+
+
+// Ray query: the small proxies are handled by btSimpleBroadphase::rayTest(); afterwards
+// every large-proxy slot up to m_LastLargeHandleIndex is passed to the callback without
+// any culling against the ray.
+void btGpu3DGridBroadphase::rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback)
+{
+	btSimpleBroadphase::rayTest(rayFrom, rayTo, rayCallback);
+	int slot = 0;
+	while(slot <= m_LastLargeHandleIndex)
+	{
+		btSimpleBroadphaseProxy* largeProxy = &m_pLargeHandles[slot];
+		rayCallback.process(largeProxy);
+		++slot;
+	}
+}
+
+
+
+//
+// overrides for CPU version
+//
+
+
+
+// Fills m_hAABB with two records (min corner, max corner) per proxy: first all small-proxy
+// slots up to m_LastHandleIndex, then all large-proxy slots up to m_LastLargeHandleIndex.
+// The uw field of the min record holds the proxy slot index and the uw field of the max
+// record holds the running count; both are offset by m_maxHandles for large proxies.
+// The Last*Index members are rewritten with the last slot visited.
+void btGpu3DGridBroadphase::prepareAABB()
+{
+	BT_PROFILE("prepareAABB");
+	bt3DGrid3F1U* out = m_hAABB;
+
+	// small proxies
+	int lastVisited = -1;
+	unsigned int smallCount = 0;
+	for(int slot = 0; slot <= m_LastHandleIndex; slot++, out += 2, smallCount++)
+	{
+		const btSimpleBroadphaseProxy& proxy = m_pHandles[slot];
+		lastVisited = slot;
+		out[0].fx = proxy.m_aabbMin.getX();
+		out[0].fy = proxy.m_aabbMin.getY();
+		out[0].fz = proxy.m_aabbMin.getZ();
+		out[0].uw = slot;
+		out[1].fx = proxy.m_aabbMax.getX();
+		out[1].fy = proxy.m_aabbMax.getY();
+		out[1].fz = proxy.m_aabbMax.getZ();
+		out[1].uw = smallCount;
+	}
+	m_LastHandleIndex = lastVisited;
+
+	// large proxies, written directly behind the small ones
+	lastVisited = -1;
+	unsigned int largeCount = 0;
+	for(int slot = 0; slot <= m_LastLargeHandleIndex; slot++, out += 2, largeCount++)
+	{
+		const btSimpleBroadphaseProxy& proxy = m_pLargeHandles[slot];
+		lastVisited = slot;
+		out[0].fx = proxy.m_aabbMin.getX();
+		out[0].fy = proxy.m_aabbMin.getY();
+		out[0].fz = proxy.m_aabbMin.getZ();
+		out[0].uw = slot + m_maxHandles;
+		out[1].fx = proxy.m_aabbMax.getX();
+		out[1].fy = proxy.m_aabbMax.getY();
+		out[1].fz = proxy.m_aabbMax.getZ();
+		out[1].uw = largeCount + m_maxHandles;
+	}
+	m_LastLargeHandleIndex = lastVisited;
+
+	// paranoid checks
+	btAssert(smallCount == m_numHandles);
+	btAssert(largeCount == m_numLargeHandles);
+}
+
+
+
+// CPU version: copies the parameter block into the file-local s3DGridBroadphaseParams.
+void btGpu3DGridBroadphase::setParameters(bt3DGridBroadphaseParams* hostParams)
+{
+	s3DGridBroadphaseParams = *hostParams;
+	return;
+}
+
+
+
+// CPU version of the hash step: forwards the host AABB array, the host hash array and the
+// number of small proxies to btGpu_calcHashAABB().
+void btGpu3DGridBroadphase::calcHashAABB()
+{
+	BT_PROFILE("bt3DGrid_calcHashAABB");
+	btGpu_calcHashAABB(m_hAABB, m_hBodiesHash, m_numHandles);
+	return;
+}
+
+
+
+// CPU version of the sort step: treats m_hBodiesHash as an array of (hash, index) records
+// and quick-sorts the first m_numHandles of them in place by descending hash value.
+void btGpu3DGridBroadphase::sortHash()
+{
+	// two unsigned ints per record, overlaid on the m_hBodiesHash storage
+	struct HashEntry
+	{
+		unsigned int hash;
+		unsigned int index;
+		// recursive quicksort of entries[first..last] with the middle element as pivot
+		static void sortDescending(HashEntry* entries, int first, int last)
+		{
+			int left = first;
+			int right = last;
+			const HashEntry pivot = entries[(first + last) / 2];
+			do
+			{
+				while(entries[left].hash > pivot.hash) left++;
+				while(pivot.hash > entries[right].hash) right--;
+				if(left <= right)
+				{
+					const HashEntry swapped = entries[left];
+					entries[left] = entries[right];
+					entries[right] = swapped;
+					left++;
+					right--;
+				}
+			} while(left <= right);
+			if(first < right) sortDescending(entries, first, right);
+			if(left < last) sortDescending(entries, left, last);
+		}
+	};
+	BT_PROFILE("bt3DGrid_sortHash");
+	HashEntry* entries = (HashEntry*)m_hBodiesHash;
+	HashEntry::sortDescending(entries, 0, m_numHandles - 1);
+}
+
+
+
+// CPU version of the cell-start step: forwards the host hash array, the host cell-start
+// array, the number of small proxies and the number of grid cells to btGpu_findCellStart().
+void btGpu3DGridBroadphase::findCellStart()
+{
+	BT_PROFILE("bt3DGrid_findCellStart");
+	btGpu_findCellStart(m_hBodiesHash, m_hCellStart, m_numHandles, m_params.m_numCells);
+	return;
+}
+
+
+
+// CPU version of the small/small pair search: forwards the host arrays and the number of
+// small proxies to btGpu_findOverlappingPairs().
+void btGpu3DGridBroadphase::findOverlappingPairs()
+{
+	BT_PROFILE("bt3DGrid_findOverlappingPairs");
+	btGpu_findOverlappingPairs(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr, m_numHandles);
+	return;
+}
+
+
+
+// CPU version of the small/large pair search: forwards the host arrays together with the
+// numbers of small and large proxies to btGpu_findPairsLarge().
+void btGpu3DGridBroadphase::findPairsLarge()
+{
+	BT_PROFILE("bt3DGrid_findPairsLarge");
+	btGpu_findPairsLarge(m_hAABB, m_hBodiesHash, m_hCellStart, m_hPairBuff, m_hPairBuffStartCurr,	m_numHandles, m_numLargeHandles);
+	return;
+}
+
+
+
+// CPU version of the change-detection step: forwards the pair buffer, its start/current
+// table, the m_hPairScanChanged array, the AABB array and the number of small proxies
+// to btGpu_computePairCacheChanges().
+void btGpu3DGridBroadphase::computePairCacheChanges()
+{
+	BT_PROFILE("bt3DGrid_computePairCacheChanges");
+	btGpu_computePairCacheChanges(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScanChanged, m_hAABB, m_numHandles);
+	return;
+}
+
+
+// CPU version of the scan step: after clearing element 0, the first m_numHandles + 2
+// elements of m_hPairScanChanged are replaced in place by their exclusive prefix sum
+// (each element becomes the sum of all elements before it).
+// The copyToCpu flag is ignored here because the data already lives on the host.
+void btGpu3DGridBroadphase::scanOverlappingPairBuff(bool copyToCpu)
+{
+	BT_PROFILE("bt3DGrid_scanOverlappingPairBuff");
+	m_hPairScanChanged[0]=0;
+	unsigned int runningTotal = 0;
+	for(int slot = 0; slot < m_numHandles + 2; ++slot)
+	{
+		const unsigned int count = m_hPairScanChanged[slot];
+		m_hPairScanChanged[slot] = runningTotal;
+		runningTotal += count;
+	}
+}
+
+
+
+// CPU version of the squeeze step: forwards the pair buffer, its start/current table, the
+// scanned change counts, an output array and the AABB array to btGpu_squeezeOverlappingPairBuff().
+// NOTE(review): the output goes to m_hAllOverlappingPairs (reinterpreted as unsigned int*),
+// while addPairsToCache() reads m_hPairsChanged - confirm that this is intended.
+void btGpu3DGridBroadphase::squeezeOverlappingPairBuff()
+{
+	BT_PROFILE("bt3DGrid_squeezeOverlappingPairBuff");
+	//btGpu_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScanChanged, m_hPairsChanged, m_hAABB, m_numHandles);
+	btGpu_squeezeOverlappingPairBuff(m_hPairBuff, m_hPairBuffStartCurr, m_hPairScanChanged, (unsigned int*)m_hAllOverlappingPairs, m_hAABB, m_numHandles);
+	
+	return;
+}
+
+
+
+#include "btGpu3DGridBroadphaseSharedCode.h"
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphase.h
@@ -0,0 +1,154 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASE_H
+#define BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+
+#include "BulletCollision/BroadphaseCollision/btSimpleBroadphase.h"
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+// Pair of two integers; element type of the array returned by btGpu3DGridBroadphase::getOverlap().
+// Despite the name both members are signed ints.
+struct MyUint2
+{
+	int x;
+	int y;
+};
+
+//----------------------------------------------------------------------------------------
+
+///The btGpu3DGridBroadphase uses GPU-style code compiled for CPU to compute overlapping pairs
+
+// Small proxies live in the handle pool inherited from btSimpleBroadphase. Proxies classified
+// as large by isLargeProxy() live in a separate pool (m_pLargeHandles) and are tested by
+// brute force. The protected virtual functions at the end are the individual pipeline steps
+// run by calculateOverlappingPairs(); derived classes may override them.
+class btGpu3DGridBroadphase : public btSimpleBroadphase
+{
+protected:
+	bool			m_bInitialized;		// set by _initialize(), cleared by _finalize()
+    unsigned int	m_numBodies;
+    unsigned int	m_numCells;			// gridSizeX * gridSizeY * gridSizeZ after rounding
+	unsigned int	m_maxPairsPerBody;	// pair slots reserved per small proxy
+    unsigned int	m_maxBodiesPerCell;
+	bt3DGridBroadphaseParams m_params;	// parameter block handed to setParameters()
+	btScalar		m_maxRadius;		// bounding-sphere radius above which a proxy is "large"
+	// CPU data
+    unsigned int*	m_hBodiesHash;			// 2 uints per small proxy: (hash, index)
+    unsigned int*	m_hCellStart;			// one entry per grid cell
+	unsigned int*	m_hPairBuffStartCurr;	// 2 uints per small proxy, plus 2
+	bt3DGrid3F1U*	m_hAABB;				// 2 records (min, max) per small and large proxy
+	unsigned int*	m_hPairBuff;			// m_maxHandles * m_maxPairsPerBody entries
+	unsigned int*	m_hPairScanChanged;		// m_maxHandles + 2 entries
+	unsigned int*	m_hPairsChanged;		// m_maxHandles * m_maxPairsPerBody entries
+	MyUint2*		m_hAllOverlappingPairs;	// m_maxHandles * m_maxPairsPerBody entries
+// large proxies
+	int		m_numLargeHandles;			// number of large proxies currently allocated
+	int		m_maxLargeHandles;			// capacity of the large-proxy pool
+	int		m_LastLargeHandleIndex;		// highest slot that has been handed out, or -1
+	btSimpleBroadphaseProxy* m_pLargeHandles;
+	void* m_pLargeHandlesRawPtr;		// raw aligned block backing m_pLargeHandles
+	int		m_firstFreeLargeHandle;		// head of the free list
+	// Takes the first slot off the free list and returns its index.
+	// The caller must make sure that the pool is not exhausted.
+	int allocLargeHandle()
+	{
+		btAssert(m_numLargeHandles < m_maxLargeHandles);
+		int freeLargeHandle = m_firstFreeLargeHandle;
+		m_firstFreeLargeHandle = m_pLargeHandles[freeLargeHandle].GetNextFree();
+		m_numLargeHandles++;
+		if(freeLargeHandle > m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex = freeLargeHandle;
+		}
+		return freeLargeHandle;
+	}
+	// Puts a slot of the large-proxy pool back at the head of the free list.
+	void freeLargeHandle(btSimpleBroadphaseProxy* proxy)
+	{
+		int handle = int(proxy - m_pLargeHandles);
+		// the slot index must lie inside the large pool (the bound used to be m_maxHandles,
+		// which is the capacity of the small-proxy pool)
+		btAssert((handle >= 0) && (handle < m_maxLargeHandles));
+		if(handle == m_LastLargeHandleIndex)
+		{
+			m_LastLargeHandleIndex--;
+		}
+		proxy->SetNextFree(m_firstFreeLargeHandle);
+		m_firstFreeLargeHandle = handle;
+		proxy->m_clientObject = 0;
+		m_numLargeHandles--;
+	}
+	// classification by AABB size (used on creation)
+	bool isLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax);
+	// classification of an existing proxy by its unique id
+	bool isLargeProxy(btBroadphaseProxy* proxy);
+// debug
+	unsigned int	m_numPairsAdded;
+	unsigned int	m_numPairsRemoved;
+	unsigned int	m_numOverflows;
+// 
+public:
+	// Returns the last element covered by the scan in scanOverlappingPairBuff().
+	virtual int getNumOverlap()
+	{
+		return m_hPairScanChanged[m_numHandles+1];
+	}
+	// Returns the host array filled by squeezeOverlappingPairBuff().
+	virtual MyUint2* getOverlap()
+	{
+		return m_hAllOverlappingPairs;
+	}
+	// NOTE : for better results gridSizeX, gridSizeY and gridSizeZ should be powers of 2 
+	// (other values are rounded down to a power of two).
+	// This overload creates its own btHashedOverlappingPairCache.
+	btGpu3DGridBroadphase(const btVector3& cellSize, 
+					   int gridSizeX, int gridSizeY, int gridSizeZ, 
+					   int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+					   int maxBodiesPerCell = 8);
+	// This overload uses the pair cache supplied by the caller.
+	btGpu3DGridBroadphase(	btOverlappingPairCache* overlappingPairCache,
+						const btVector3& cellSize, 
+						int gridSizeX, int gridSizeY, int gridSizeZ, 
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+						int maxBodiesPerCell = 8);
+	virtual ~btGpu3DGridBroadphase();
+	virtual void	calculateOverlappingPairs(btDispatcher* dispatcher);
+
+	virtual btBroadphaseProxy*	createProxy(const btVector3& aabbMin,  const btVector3& aabbMax,int shapeType,void* userPtr ,short int collisionFilterGroup,short int collisionFilterMask, btDispatcher* dispatcher,void* multiSapProxy);
+	virtual void	destroyProxy(btBroadphaseProxy* proxy,btDispatcher* dispatcher);
+	virtual void	rayTest(const btVector3& rayFrom,const btVector3& rayTo, btBroadphaseRayCallback& rayCallback);
+	virtual void	resetPool(btDispatcher* dispatcher);
+
+	static int		getFloorPowOfTwo(int val); // returns 2^n : 2^(n+1) > val >= 2^n
+
+protected:
+	// one-time setup / teardown of parameters and host buffers
+	void _initialize(	const btVector3& cellSize, 
+						int gridSizeX, int gridSizeY, int gridSizeZ, 
+						int maxSmallProxies, int maxLargeProxies, int maxPairsPerBody,
+						btScalar maxSmallProxySize,
+						int maxBodiesPerCell);
+	void _finalize();
+	void addPairsToCache(btDispatcher* dispatcher);
+	void addLarge2LargePairsToCache(btDispatcher* dispatcher);
+
+// overrides for CPU version
+	virtual void setParameters(bt3DGridBroadphaseParams* hostParams);
+	virtual void prepareAABB();
+	virtual void calcHashAABB();
+	virtual void sortHash();	
+	virtual void findCellStart();
+	virtual void findOverlappingPairs();
+	virtual void findPairsLarge();
+	virtual void computePairCacheChanges();
+	virtual void scanOverlappingPairBuff(bool copyToCpu=true);
+	virtual void squeezeOverlappingPairBuff();
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif //BTGPU3DGRIDBROADPHASE_H
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedCode.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedCode.h
@@ -0,0 +1,428 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+// Convert a position into integer cell coordinates of the uniform grid.
+// Each axis is scaled by the inverse cell size, floored, and masked with
+// (gridSize - 1). The mask acts as a wrap-around only if the grid sizes are
+// powers of two (assumed -- TODO confirm against the parameter setup).
+// p.w is not read.
+BT_GPU___device__ int3 bt3DGrid_calcGridPos(float4 p)
+{
+	const int cellX = (int)floor(p.x * BT_GPU_params.m_invCellSizeX);
+	const int cellY = (int)floor(p.y * BT_GPU_params.m_invCellSizeY);
+	const int cellZ = (int)floor(p.z * BT_GPU_params.m_invCellSizeZ);
+	int3 cell;
+	cell.x = cellX & (BT_GPU_params.m_gridSizeX - 1);
+	cell.y = cellY & (BT_GPU_params.m_gridSizeY - 1);
+	cell.z = cellZ & (BT_GPU_params.m_gridSizeZ - 1);
+	return cell;
+} // bt3DGrid_calcGridPos()
+
+//----------------------------------------------------------------------------------------
+
+// Linearise integer cell coordinates into a cell index.
+// Every coordinate is first masked with (gridSize - 1), so out-of-range values
+// (e.g. the -1/+1 neighbour offsets) are folded back into the grid rather than
+// clamped. The result is z * sizeY * sizeX + y * sizeX + x.
+BT_GPU___device__ uint bt3DGrid_calcGridHash(int3 gridPos)
+{
+	const int x = gridPos.x & (BT_GPU_params.m_gridSizeX - 1);
+	const int y = gridPos.y & (BT_GPU_params.m_gridSizeY - 1);
+	const int z = gridPos.z & (BT_GPU_params.m_gridSizeZ - 1);
+	return BT_GPU___mul24(BT_GPU___mul24(z, BT_GPU_params.m_gridSizeY), BT_GPU_params.m_gridSizeX)
+		+ BT_GPU___mul24(y, BT_GPU_params.m_gridSizeX)
+		+ x;
+} // bt3DGrid_calcGridHash()
+
+//----------------------------------------------------------------------------------------
+
+// Compute the grid hash of every body from the centre of its AABB.
+// One thread handles one body; threads with index >= numBodies do nothing.
+//   pAABB     - two entries per body: [2*i] is the min corner, [2*i + 1] the max corner
+//   pHash     - output, pHash[i] = (grid hash, i)
+//   numBodies - number of bodies to process
+BT_GPU___global__ void calcHashAABBD(bt3DGrid3F1U* pAABB, uint2* pHash, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+	bt3DGrid3F1U bbMin = pAABB[index*2];
+	bt3DGrid3F1U bbMax = pAABB[index*2 + 1];
+	// centre of the bounding box
+	float4 pos;
+	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
+	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
+	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
+	// w is not used by the grid lookup, but pos is passed by value below, so
+	// give it a defined value instead of copying an uninitialised float
+	pos.w = 0.f;
+    // get address in grid
+    int3 gridPos = bt3DGrid_calcGridPos(pos);
+    uint gridHash = bt3DGrid_calcGridHash(gridPos);
+    // store grid hash and body index
+    pHash[index] = BT_GPU_make_uint2(gridHash, index);
+} // calcHashAABBD()
+
+//----------------------------------------------------------------------------------------
+
+// Record, for every hash value that occurs in pHash, the index of its first entry.
+// Each thread compares its own hash (pHash[index].x) with the hash of the
+// preceding entry; where they differ, or at index 0, it writes
+// cellStart[hash] = index. Hash values that never occur are not written here.
+//   pHash     - (hash, body index) pairs; presumably already sorted by hash -- TODO confirm against caller
+//   cellStart - output, indexed by hash value
+//   numBodies - number of entries in pHash
+BT_GPU___global__ void findCellStartD(uint2* pHash, uint* cellStart, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    // NOTE(review): threads past the end leave before the barrier below
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	// Load hash data into shared memory so that we can look
+	// at neighboring body's hash value without loading
+	// two hash values per thread
+	// Slot t+1 holds the hash of thread t; slot 0 holds the hash of the entry
+	// just before this block. 257 slots therefore cover a block of up to 256
+	// threads, which is the block size the host wrapper in this file requests.
+	BT_GPU___shared__ uint sharedHash[257];
+	sharedHash[BT_GPU_threadIdx.x+1] = sortedData.x;
+	if((index > 0) && (BT_GPU_threadIdx.x == 0))
+	{
+		// first thread in block must load neighbor body hash
+		volatile uint2 prevData = pHash[index-1];
+		sharedHash[0] = prevData.x;
+	}
+	// make every thread's slot visible before neighbours read it
+	BT_GPU___syncthreads();
+	// sharedHash[threadIdx.x] is the hash of the previous entry; a change of
+	// hash (or the very first entry) marks the start of a cell
+	if((index == 0) || (sortedData.x != sharedHash[BT_GPU_threadIdx.x]))
+	{
+		cellStart[sortedData.x] = index;
+	}
+} // findCellStartD()
+
+//----------------------------------------------------------------------------------------
+
+// Axis-aligned box overlap test on the fx/fy/fz members (uw is ignored).
+// Returns 1 when the boxes [min0,max0] and [min1,max1] overlap or touch on
+// all three axes, 0 otherwise.
+BT_GPU___device__ uint cudaTestAABBOverlap(bt3DGrid3F1U min0, bt3DGrid3F1U max0, bt3DGrid3F1U min1, bt3DGrid3F1U max1)
+{
+	const bool overlapX = (min0.fx <= max1.fx) && (min1.fx <= max0.fx);
+	const bool overlapY = (min0.fy <= max1.fy) && (min1.fy <= max0.fy);
+	const bool overlapZ = (min0.fz <= max1.fz) && (min1.fz <= max0.fz);
+	return overlapX && overlapY && overlapZ;
+} // cudaTestAABBOverlap()
+ 
+//----------------------------------------------------------------------------------------
+
+// Test one body against the bodies hashed into a single grid cell and update
+// that body's pair list.
+//   gridPos            - cell to examine (wrapped into the grid by the hash function)
+//   index              - position of this thread's body in pHash
+//   pHash              - (cell hash, body index) pairs
+//   pCellStart         - first pHash index per cell, 0xffffffff when the cell is empty
+//   pAABB              - min/max entries per body; the min entry's uw is the handle index
+//   pPairBuff          - per-handle pair lists (other handle plus FOUND/NEW flag bits)
+//   pPairBuffStartCurr - per handle: x = list start in pPairBuff, y = current list length
+//   numBodies          - number of entries in pHash
+// An overlapping body that is already listed gets the FOUND flag; an unlisted
+// one is appended with the NEW flag unless the list is full.
+BT_GPU___device__ void findPairsInCell(	int3	gridPos,
+										uint    index,
+										uint2*  pHash,
+										uint*   pCellStart,
+										bt3DGrid3F1U* pAABB, 
+										uint*   pPairBuff,
+										uint2*	pPairBuffStartCurr,
+										uint	numBodies)
+{
+	const uint cellHash = bt3DGrid_calcGridHash(gridPos);
+	const uint firstInCell = pCellStart[cellHash];
+	if(firstInCell == 0xffffffff)
+	{
+		return;	// nothing was hashed into this cell
+	}
+	// AABB and handle of the body owned by this thread
+	const uint2 selfEntry = pHash[index];
+	const uint selfBody = selfEntry.y;
+	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, selfBody*2);
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, selfBody*2 + 1);
+	const uint selfHandle = min0.uw;
+	// this handle's slice of the pair buffer
+	const uint2 selfRange = pPairBuffStartCurr[selfHandle];
+	const uint listStart = selfRange.x;
+	uint listCount = selfRange.y;
+	const uint2 nextRange = pPairBuffStartCurr[selfHandle+1];
+	const uint listCapacity = nextRange.x - listStart - 1;
+	// never scan more than m_maxBodiesPerCell entries, nor past the end of pHash
+	uint scanEnd = firstInCell + BT_GPU_params.m_maxBodiesPerCell;
+	if(scanEnd > numBodies)
+	{
+		scanEnd = numBodies;
+	}
+	for(uint slot = firstInCell; slot < scanEnd; slot++)
+	{
+		const uint2 otherEntry = pHash[slot];
+		if(otherEntry.x != cellHash)
+		{
+			break;	// ran into the next cell's entries
+		}
+		const uint otherBody = otherEntry.y;
+		if(otherBody >= selfBody)
+		{
+			continue;	// skips the body itself; only lower-indexed bodies are recorded
+		}
+		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, otherBody*2);
+		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, otherBody*2 + 1);
+		if(!cudaTestAABBOverlap(min0, max0, min1, max1))
+		{
+			continue;
+		}
+		const uint otherHandle = min1.uw;
+		// is this partner already in the list?
+		bool alreadyListed = false;
+		for(uint k = 0; (k < listCount) && !alreadyListed; k++)
+		{
+			if((pPairBuff[listStart+k] & (~BT_3DGRID_PAIR_ANY_FLG)) == otherHandle)
+			{
+				pPairBuff[listStart+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+				alreadyListed = true;
+			}
+		}
+		if(alreadyListed)
+		{
+			continue;
+		}
+		if(listCount >= listCapacity)
+		{
+			break;	// list is full: stop scanning rather than overrun the buffer
+		}
+		pPairBuff[listStart+listCount] = otherHandle | BT_3DGRID_PAIR_NEW_FLG;
+		listCount++;
+	}
+	pPairBuffStartCurr[selfHandle] = BT_GPU_make_uint2(listStart, listCount);
+} // findPairsInCell()
+
+//----------------------------------------------------------------------------------------
+
+// For every body, look for overlapping bodies in the 3x3x3 block of grid cells
+// around the cell that contains the centre of its AABB.
+// One thread handles one entry of pHash; threads with index >= numBodies do nothing.
+//   pAABB              - min/max entries per body
+//   pHash              - (cell hash, body index) pairs
+//   pCellStart         - first pHash index per cell
+//   pPairBuff          - per-handle pair lists, updated by findPairsInCell()
+//   pPairBuffStartCurr - per-handle (start, length) of the pair lists
+//   numBodies          - number of entries in pHash
+BT_GPU___global__ void findOverlappingPairsD(	bt3DGrid3F1U*	pAABB, uint2* pHash, uint* pCellStart, 
+												uint* pPairBuff, uint2* pPairBuffStartCurr, uint numBodies)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	bt3DGrid3F1U bbMin = BT_GPU_FETCH(pAABB, unsorted_indx*2);
+	bt3DGrid3F1U bbMax = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	// centre of the bounding box
+	float4 pos;
+	pos.x = (bbMin.fx + bbMax.fx) * 0.5f;
+	pos.y = (bbMin.fy + bbMax.fy) * 0.5f;
+	pos.z = (bbMin.fz + bbMax.fz) * 0.5f;
+	// w is not used by the grid lookup, but pos is passed by value below, so
+	// give it a defined value instead of copying an uninitialised float
+	pos.w = 0.f;
+    // get address in grid
+    int3 gridPos = bt3DGrid_calcGridPos(pos);
+    // examine only neighbouring cells (the cell itself plus its 26 neighbours)
+    for(int z=-1; z<=1; z++) {
+        for(int y=-1; y<=1; y++) {
+            for(int x=-1; x<=1; x++) {
+                findPairsInCell(gridPos + BT_GPU_make_int3(x, y, z), index, pHash, pCellStart, pAABB, pPairBuff, pPairBuffStartCurr, numBodies);
+            }
+        }
+    }
+} // findOverlappingPairsD()
+
+//----------------------------------------------------------------------------------------
+
+// Test every grid ("small") body against all large bodies and update the small
+// body's pair list.
+// One thread handles one entry of pHash; threads with index >= numBodies do nothing.
+//   pAABB              - min/max entries per body; the large bodies occupy the
+//                        entries following the numBodies grid bodies
+//   pHash              - (cell hash, body index) pairs
+//   pCellStart         - not used by this kernel
+//   pPairBuff          - per-handle pair lists (other handle plus FOUND/NEW flag bits)
+//   pPairBuffStartCurr - per handle: x = list start in pPairBuff, y = current list length
+//   numBodies          - number of grid bodies
+//   numLarge           - number of large bodies
+BT_GPU___global__ void findPairsLargeD(	bt3DGrid3F1U* pAABB, uint2* pHash, uint* pCellStart, uint* pPairBuff, 
+										uint2* pPairBuffStartCurr, uint numBodies, uint numLarge)
+{
+    int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+    if(index >= (int)numBodies)
+	{
+		return;
+	}
+    uint2 sortedData = pHash[index];
+	uint unsorted_indx = sortedData.y;
+	bt3DGrid3F1U min0 = BT_GPU_FETCH(pAABB, unsorted_indx*2);
+	bt3DGrid3F1U max0 = BT_GPU_FETCH(pAABB, unsorted_indx*2 + 1);
+	uint handleIndex =  min0.uw;
+	// this handle's slice of the pair buffer
+	uint2 start_curr = pPairBuffStartCurr[handleIndex];
+	uint start = start_curr.x;
+	uint curr = start_curr.y;
+	uint2 start_curr_next = pPairBuffStartCurr[handleIndex+1];
+	uint curr_max = start_curr_next.x - start - 1;
+    for(uint i = 0; i < numLarge; i++)
+    {
+		uint indx2 = numBodies + i;
+		bt3DGrid3F1U min1 = BT_GPU_FETCH(pAABB, indx2*2);
+		bt3DGrid3F1U max1 = BT_GPU_FETCH(pAABB, indx2*2 + 1);
+		if(cudaTestAABBOverlap(min0, max0, min1, max1))
+		{
+			uint k;
+			uint handleIndex2 =  min1.uw;
+			// already listed? then just mark it as still overlapping
+			for(k = 0; k < curr; k++)
+			{
+				uint old_pair = pPairBuff[start+k] & (~BT_3DGRID_PAIR_ANY_FLG);
+				if(old_pair == handleIndex2)
+				{
+					pPairBuff[start+k] |= BT_3DGRID_PAIR_FOUND_FLG;
+					break;
+				}
+			}
+			if(k == curr)
+			{
+				// check the capacity BEFORE writing, as findPairsInCell() does, so
+				// that a full list is left untouched instead of receiving an entry
+				// that is never counted in curr
+				if(curr >= curr_max) 
+				{ // not a good solution, but let's avoid crash
+					break;
+				}
+				pPairBuff[start+curr] = handleIndex2 | BT_3DGRID_PAIR_NEW_FLG;
+				curr++;
+			}
+		}
+    }
+	pPairBuffStartCurr[handleIndex] = BT_GPU_make_uint2(start, curr);
+    return;
+} // findPairsLargeD()
+
+//----------------------------------------------------------------------------------------
+
+// Count, per body, how many entries of its pair list carry the FOUND or NEW flag.
+// One thread handles one body; threads with index >= numBodies do nothing.
+//   pPairBuff          - per-handle pair lists
+//   pPairBuffStartCurr - per handle: x = list start in pPairBuff, y = list length
+//   pPairScan          - output; the count for body i is stored at pPairScan[i + 1]
+//   pAABB              - min/max entries per body; the min entry's uw is the handle index
+//   numBodies          - number of bodies
+BT_GPU___global__ void computePairCacheChangesD(uint* pPairBuff, uint2* pPairBuffStartCurr, 
+												uint* pPairScan, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+	const int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index >= (int)numBodies)
+	{
+		return;
+	}
+	const bt3DGrid3F1U bodyMin = pAABB[index * 2];
+	const uint handle = bodyMin.uw;
+	const uint2 listRange = pPairBuffStartCurr[handle];
+	const uint* list = pPairBuff + listRange.x;
+	uint flagged = 0;
+	for(uint k = 0; k < listRange.y; k++)
+	{
+		if(list[k] & BT_3DGRID_PAIR_ANY_FLG)
+		{
+			flagged++;
+		}
+	}
+	pPairScan[index+1] = flagged;
+} // computePairCacheChangesD()
+
+//----------------------------------------------------------------------------------------
+
+// Emit the flagged pairs of every body and compact its pair list in place.
+// One thread handles one body; threads with index >= numBodies do nothing.
+// For each list entry that carries the FOUND or NEW flag:
+//   - (this handle, other handle) is written to pPairOut, starting at offset
+//     pPairScan[index + 1];
+//   - the entry, with its flags cleared, is kept at the front of the list.
+// Entries without any flag are dropped; the new list length is stored back.
+//   pPairBuff          - per-handle pair lists
+//   pPairBuffStartCurr - per handle: x = list start in pPairBuff, y = list length
+//   pPairScan          - output offsets into pPairOut (read at index + 1)
+//   pPairOut           - output pairs
+//   pAABB              - min/max entries per body; the min entry's uw is the handle index
+//   numBodies          - number of bodies
+BT_GPU___global__ void squeezeOverlappingPairBuffD(uint* pPairBuff, uint2* pPairBuffStartCurr, uint* pPairScan,
+												   uint2* pPairOut, bt3DGrid3F1U* pAABB, uint numBodies)
+{
+	const int index = BT_GPU___mul24(BT_GPU_blockIdx.x, BT_GPU_blockDim.x) + BT_GPU_threadIdx.x;
+	if(index >= (int)numBodies)
+	{
+		return;
+	}
+	const bt3DGrid3F1U bodyMin = pAABB[index * 2];
+	const uint handle = bodyMin.uw;
+	const uint2 listRange = pPairBuffStartCurr[handle];
+	const uint listStart = listRange.x;
+	const uint listCount = listRange.y;
+	uint* list = pPairBuff + listStart;
+	uint2* out = pPairOut + pPairScan[index+1];
+	uint kept = 0;
+	for(uint k = 0; k < listCount; k++)
+	{
+		const uint entry = list[k];
+		if(entry & BT_3DGRID_PAIR_ANY_FLG)
+		{
+			const uint otherHandle = entry & (~BT_3DGRID_PAIR_ANY_FLG);
+			out->x = handle;
+			out->y = otherHandle;
+			out++;
+			// kept <= k, so this never overwrites an entry that is still to be read
+			list[kept] = otherHandle;
+			kept++;
+		}
+	}
+	pPairBuffStartCurr[handle] = BT_GPU_make_uint2(listStart, kept);
+} // squeezeOverlappingPairBuffD()
+
+
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//               E N D   O F    K E R N E L    F U N C T I O N S 
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: run calcHashAABBD over numBodies bodies (block size 256).
+// hash is handed to the kernel as an array of uint2 (hash, body index) pairs.
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies)
+{
+	int blockCount = 0;
+	int threadsPerBlock = 0;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	// launch, then report any launch/execution error
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, calcHashAABBD, (pAABB, (uint2*)hash, numBodies));
+	BT_GPU_CHECK_ERROR("calcHashAABBD kernel execution failed");
+} // calcHashAABB()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: fill all numCells entries of cellStart with 0xffffffff
+// (the value the pair search treats as "cell empty") and then run
+// findCellStartD over the numBodies entries of hash (block size 256).
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells)
+{
+	int blockCount = 0;
+	int threadsPerBlock = 0;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	BT_GPU_SAFE_CALL(BT_GPU_Memset(cellStart, 0xffffffff, numCells*sizeof(uint)));
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findCellStartD, ((uint2*)hash, (uint*)cellStart, numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: findCellStartD");
+} // findCellStart()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: run findOverlappingPairsD over numBodies bodies (block size 64).
+// When B_CUDA_USE_TEX is enabled the AABB array is bound to pAABBTex for the
+// duration of the kernel.
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies)
+{
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, numBodies * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+	int blockCount = 0;
+	int threadsPerBlock = 0;
+	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findOverlappingPairsD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: bt_CudaFindOverlappingPairsD");
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findOverlappingPairs()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: run findPairsLargeD over numBodies grid bodies (block size 64),
+// testing each of them against the numLarge large bodies.
+// When B_CUDA_USE_TEX is enabled the AABB array (grid plus large bodies) is
+// bound to pAABBTex for the duration of the kernel.
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge)
+{
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaBindTexture(0, pAABBTex, pAABB, (numBodies+numLarge) * 2 * sizeof(bt3DGrid3F1U)));
+#endif
+	int blockCount = 0;
+	int threadsPerBlock = 0;
+	BT_GPU_PREF(computeGridSize)(numBodies, 64, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, findPairsLargeD, (pAABB,(uint2*)pHash,(uint*)pCellStart,(uint*)pPairBuff,(uint2*)pPairBuffStartCurr,numBodies,numLarge));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: btCuda_findPairsLargeD");
+#if B_CUDA_USE_TEX
+	BT_GPU_SAFE_CALL(cudaUnbindTexture(pAABBTex));
+#endif
+} // findPairsLarge()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: run computePairCacheChangesD over numBodies bodies (block size 256).
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies)
+{
+	int blockCount = 0;
+	int threadsPerBlock = 0;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, computePairCacheChangesD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,pAABB,numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaComputePairCacheChangesD");
+} // computePairCacheChanges()
+
+//----------------------------------------------------------------------------------------
+
+// Host entry point: run squeezeOverlappingPairBuffD over numBodies bodies (block size 256).
+// pPairOut is handed to the kernel as an array of uint2 pairs.
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan,  unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies)
+{
+	int blockCount = 0;
+	int threadsPerBlock = 0;
+	BT_GPU_PREF(computeGridSize)(numBodies, 256, blockCount, threadsPerBlock);
+	BT_GPU_EXECKERNEL(blockCount, threadsPerBlock, squeezeOverlappingPairBuffD, ((uint*)pPairBuff,(uint2*)pPairBuffStartCurr,(uint*)pPairScan,(uint2*)pPairOut,pAABB,numBodies));
+	BT_GPU_CHECK_ERROR("Kernel execution failed: btCudaSqueezeOverlappingPairBuffD");
+} // squeezeOverlappingPairBuff()
+
+//------------------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
+//------------------------------------------------------------------------------------------------
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedDefs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedDefs.h
@@ -0,0 +1,61 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+#define BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpu3DGridBroadphaseSharedTypes.h"
+
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Computes one (grid hash, body index) pair per body from the centre of its
+// AABB; hash receives 2 * numBodies unsigned ints (it is used as a uint2 array).
+void BT_GPU_PREF(calcHashAABB)(bt3DGrid3F1U* pAABB, unsigned int* hash,	unsigned int numBodies);
+
+// Marks all numCells entries of cellStart as empty (0xffffffff) and stores, for
+// each hash value present in hash, the index of its first entry.
+void BT_GPU_PREF(findCellStart)(unsigned int* hash, unsigned int* cellStart, unsigned int numBodies, unsigned int numCells);
+
+// Searches the cells around each body for overlapping AABBs and updates the
+// per-body pair lists in pPairBuff / pPairBuffStartCurr.
+void BT_GPU_PREF(findOverlappingPairs)(bt3DGrid3F1U* pAABB, unsigned int* pHash,	unsigned int* pCellStart, unsigned int*	pPairBuff, unsigned int*	pPairBuffStartCurr, unsigned int	numBodies);
+
+// Tests each of the numBodies grid bodies against the numLarge bodies stored
+// after them in pAABB and updates the per-body pair lists.
+void BT_GPU_PREF(findPairsLarge)(bt3DGrid3F1U* pAABB, unsigned int* pHash, unsigned int* pCellStart, unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int numBodies, unsigned int numLarge);
+
+// Counts, per body, the pair-list entries flagged FOUND or NEW; the count for
+// body i is written to pPairScan[i + 1].
+void BT_GPU_PREF(computePairCacheChanges)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+// Writes the flagged pairs of every body to pPairOut (used as a uint2 array,
+// offsets taken from pPairScan) and compacts the pair lists, clearing the flags.
+void BT_GPU_PREF(squeezeOverlappingPairBuff)(unsigned int* pPairBuff, unsigned int* pPairBuffStartCurr, unsigned int* pPairScan, unsigned int* pPairOut, bt3DGrid3F1U* pAABB, unsigned int numBodies);
+
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDDEFS_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedTypes.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpu3DGridBroadphaseSharedTypes.h
@@ -0,0 +1,64 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared definitions for GPU-based 3D Grid collision detection broadphase
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  it is included into both CUDA and CPU code
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#ifndef BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+#define BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
+//----------------------------------------------------------------------------------------
+
+// Flag bits stored in the high bits of a pair-buffer entry; the remaining bits
+// hold the handle index of the other body.
+// FOUND: an entry that was already in the list was detected again.
+#define BT_3DGRID_PAIR_FOUND_FLG (0x40000000)
+// NEW: the entry was appended during the current pair search.
+#define BT_3DGRID_PAIR_NEW_FLG   (0x20000000)
+// Mask covering both flags; also used (inverted) to extract the handle index.
+#define BT_3DGRID_PAIR_ANY_FLG   (BT_3DGRID_PAIR_FOUND_FLG | BT_3DGRID_PAIR_NEW_FLG)
+
+//----------------------------------------------------------------------------------------
+
+// Parameters of the uniform grid, shared between host and kernel code.
+// NOTE(review): the member order is presumably part of the host/device
+// contract -- do not reorder without checking every user.
+struct bt3DGridBroadphaseParams 
+{
+	// grid dimensions in cells; the kernels mask coordinates with (size - 1),
+	// which assumes powers of two -- TODO confirm
+	unsigned int	m_gridSizeX;
+	unsigned int	m_gridSizeY;
+	unsigned int	m_gridSizeZ;
+	// presumably the total number of cells -- verify against the setup code
+	unsigned int	m_numCells;
+	// multiplied with a position to obtain cell coordinates
+	float			m_invCellSizeX;
+	float			m_invCellSizeY;
+	float			m_invCellSizeZ;
+	unsigned int	m_numBodies;
+	// upper bound on the entries scanned per cell during the pair search
+	unsigned int	m_maxBodiesPerCell;
+};
+
+//----------------------------------------------------------------------------------------
+
+// Three floats plus one unsigned integer. The kernels store AABBs as pairs of
+// these: entry 2*i is the min corner of body i, entry 2*i + 1 its max corner.
+struct bt3DGrid3F1U
+{
+	float			fx;
+	float			fy;
+	float			fz;
+	// in the min-corner entry this is read as the body's handle index
+	unsigned int	uw;
+};
+
+//----------------------------------------------------------------------------------------
+
+#endif // BTGPU3DGRIDBROADPHASESHAREDTYPES_H
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuDefines.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuDefines.h
@@ -0,0 +1,211 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+
+// definitions for "GPU on CPU" code
+
+
+#ifndef BT_GPU_DEFINES_H
+#define BT_GPU_DEFINES_H
+
+// short alias for unsigned int, as used throughout the shared kernel code
+typedef unsigned int uint;
+
+// two-component signed integer vector for the "GPU on CPU" build
+struct int2
+{
+	int x;
+	int y;
+};
+
+// two-component unsigned integer vector for the "GPU on CPU" build
+struct uint2
+{
+	unsigned int x;
+	unsigned int y;
+};
+
+// three-component signed integer vector for the "GPU on CPU" build
+struct int3
+{
+	int x;
+	int y;
+	int z;
+};
+
+// three-component unsigned integer vector for the "GPU on CPU" build
+struct uint3
+{
+	unsigned int x;
+	unsigned int y;
+	unsigned int z;
+};
+
+// four-component float vector for the "GPU on CPU" build
+struct float4
+{
+	float x;
+	float y;
+	float z;
+	float w;
+};
+
+// three-component float vector for the "GPU on CPU" build
+struct float3
+{
+	float x;
+	float y;
+	float z;
+};
+
+
+// CPU replacements for the GPU qualifiers and intrinsics used by the shared
+// kernel code.
+// device functions become ordinary inline functions
+#define BT_GPU___device__ inline
+// device-data and constant qualifiers expand to nothing
+#define BT_GPU___devdata__
+#define BT_GPU___constant__
+// NOTE: both macros evaluate one of their arguments twice
+#define BT_GPU_max(a, b) ((a) > (b) ? (a) : (b))
+#define BT_GPU_min(a, b) ((a) < (b) ? (a) : (b))
+// name of the parameter object the kernels read; it must be defined by the including code
+#define BT_GPU_params s3DGridBroadphaseParams
+// plain multiplication stands in for the 24-bit multiply
+#define BT_GPU___mul24(a, b) ((a)*(b))
+// kernels become ordinary inline functions
+#define BT_GPU___global__ inline
+// "shared" arrays become function-local statics
+#define BT_GPU___shared__ static
+// barrier is a no-op here
+#define BT_GPU___syncthreads()
+#define CUDART_PI_F SIMD_PI
+
+// build a uint2 from its two components
+static inline uint2 bt3dGrid_make_uint2(unsigned int x, unsigned int y)
+{
+	uint2 result = { x, y };
+	return result;
+}
+#define BT_GPU_make_uint2(x, y) bt3dGrid_make_uint2(x, y)
+
+// build an int3 from its three components
+static inline int3 bt3dGrid_make_int3(int x, int y, int z)
+{
+	int3 result = { x, y, z };
+	return result;
+}
+#define BT_GPU_make_int3(x, y, z) bt3dGrid_make_int3(x, y, z)
+
+// build a float3 from its three components
+static inline float3 bt3dGrid_make_float3(float x, float y, float z)
+{
+	float3 result = { x, y, z };
+	return result;
+}
+#define BT_GPU_make_float3(x, y, z) bt3dGrid_make_float3(x, y, z)
+
+// build a float3 from the x, y and z components of a float4 (w is dropped)
+static inline float3 bt3dGrid_make_float34(float4 f)
+{
+	float3 result = { f.x, f.y, f.z };
+	return result;
+}
+#define BT_GPU_make_float34(f) bt3dGrid_make_float34(f)
+
+// build a float3 with all three components set to f
+static inline float3 bt3dGrid_make_float31(float f)
+{
+	float3 result = { f, f, f };
+	return result;
+}
+#define BT_GPU_make_float31(x) bt3dGrid_make_float31(x)
+
+// build a float4 from a float3 (x, y, z) and a scalar (w)
+static inline float4 bt3dGrid_make_float42(float3 v, float f)
+{
+	float4 result = { v.x, v.y, v.z, f };
+	return result;
+}
+#define BT_GPU_make_float42(a, b) bt3dGrid_make_float42(a, b)
+
+// build a float4 from its four components
+static inline float4 bt3dGrid_make_float44(float a, float b, float c, float d)
+{
+	float4 result = { a, b, c, d };
+	return result;
+}
+#define BT_GPU_make_float44(a, b, c, d) bt3dGrid_make_float44(a, b, c, d)
+
+// component-wise sum of two int3 vectors
+inline int3 operator+(int3 a, int3 b)
+{
+	int3 sum;
+	sum.x = a.x + b.x;
+	sum.y = a.y + b.y;
+	sum.z = a.z + b.z;
+	return sum;
+}
+
+// component-wise sum of two float4 vectors
+inline float4 operator+(const float4& a, const float4& b)
+{
+	float4 sum = { a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w };
+	return sum;
+}
+// scale every component of a float4 by fact
+inline float4 operator*(const float4& a, float fact)
+{
+	float4 scaled = { a.x*fact, a.y*fact, a.z*fact, a.w*fact };
+	return scaled;
+}
+// scalar * float4; same result as float4 * scalar.
+// The vector is taken by const reference so that const objects and temporaries
+// (e.g. fact * (a + b)) are accepted as well; it is never modified.
+inline float4 operator*(float fact, const float4& a)
+{
+	return (a * fact);
+}
+// scale a float4 in place by fact and return it
+inline float4& operator*=(float4& a, float fact)
+{
+	a = a * fact;
+	return a;
+}
+// add b to a component-wise, in place, and return a
+inline float4& operator+=(float4& a, const float4& b)
+{
+	a.x += b.x;
+	a.y += b.y;
+	a.z += b.z;
+	a.w += b.w;
+	return a;
+}
+
+// component-wise sum of two float3 vectors
+inline float3 operator+(const float3& a, const float3& b)
+{
+	float3 sum = { a.x+b.x, a.y+b.y, a.z+b.z };
+	return sum;
+}
+// component-wise difference of two float3 vectors
+inline float3 operator-(const float3& a, const float3& b)
+{
+	float3 diff = { a.x-b.x, a.y-b.y, a.z-b.z };
+	return diff;
+}
+// Dot product of two float3 vectors.
+// Both operands are read-only, so they are taken by const reference (as
+// bt3dGrid_cross does); this also lets const objects and temporaries be passed.
+static inline float bt3dGrid_dot(const float3& a, const float3& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z;
+}
+#define BT_GPU_dot(a,b) bt3dGrid_dot(a,b)
+
+// Dot product of two float4 vectors (all four components).
+// Both operands are read-only, so they are taken by const reference; this also
+// lets const objects and temporaries be passed.
+static inline float bt3dGrid_dot4(const float4& a, const float4& b)
+{
+	return a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
+}
+#define BT_GPU_dot4(a,b) bt3dGrid_dot4(a,b)
+
+// cross product a x b of two float3 vectors
+static inline float3 bt3dGrid_cross(const float3& a, const float3& b)
+{
+	float3 result = {
+		a.y*b.z-a.z*b.y,
+		-a.x*b.z+a.z*b.x,
+		a.x*b.y-a.y*b.x
+	};
+	return result;
+}
+#define BT_GPU_cross(a,b) bt3dGrid_cross(a,b)
+
+
+// scale every component of a float3 by fact
+inline float3 operator*(const float3& a, float fact)
+{
+	float3 scaled = { a.x*fact, a.y*fact, a.z*fact };
+	return scaled;
+}
+
+
+// add b to a component-wise, in place, and return a
+inline float3& operator+=(float3& a, const float3& b)
+{
+	a.x += b.x;
+	a.y += b.y;
+	a.z += b.z;
+	return a;
+}
+// subtract b from a component-wise, in place, and return a
+inline float3& operator-=(float3& a, const float3& b)
+{
+	a.x -= b.x;
+	a.y -= b.y;
+	a.z -= b.z;
+	return a;
+}
+// scale a float3 in place by fact and return it
+inline float3& operator*=(float3& a, float fact)
+{
+	a.x *= fact;
+	a.y *= fact;
+	a.z *= fact;
+	return a;
+}
+// negate every component of a float3
+inline float3 operator-(const float3& v)
+{
+	float3 negated = { -v.x, -v.y, -v.z };
+	return negated;
+}
+
+
+// CPU replacements for memory access and runtime calls.
+// element fetches are plain array indexing
+#define BT_GPU_FETCH(a, b) a[b]
+#define BT_GPU_FETCH4(a, b) a[b]
+// prefix added to the names of the host entry points
+#define BT_GPU_PREF(func) btGpu_##func
+// calls are made as-is; their result is not checked
+#define BT_GPU_SAFE_CALL(func) func
+// NOTE(review): memset/memcpy are not declared by this header; the including
+// code presumably pulls in <string.h> -- confirm
+#define BT_GPU_Memset memset
+#define BT_GPU_MemcpyToSymbol(a, b, c) memcpy(&a, b, c)
+// texture binding expands to nothing
+#define BT_GPU_BindTexture(a, b, c, d)
+#define BT_GPU_UnbindTexture(a)
+
+// Emulated launch state: the kernels read their block index, block size and
+// thread index from these variables.
+// NOTE(review): being static in a header, every translation unit that includes
+// this file gets its own copy.
+static uint2 s_blockIdx, s_blockDim, s_threadIdx;
+#define BT_GPU_blockIdx s_blockIdx
+#define BT_GPU_blockDim s_blockDim
+#define BT_GPU_threadIdx s_threadIdx
+// Emulated kernel launch: calls kfunc once per (block, thread) combination, one
+// after another, blocks in the outer loop and threads in the inner loop.
+// Only the .x components of the launch state are set.
+// The expansion declares loop variables named nb and nt, so arguments passed
+// to this macro must not use those names.
+#define BT_GPU_EXECKERNEL(numb, numt, kfunc, args) {s_blockDim.x=numt;for(int nb=0;nb<numb;nb++){s_blockIdx.x=nb;for(int nt=0;nt<numt;nt++){s_threadIdx.x=nt;kfunc args;}}}
+
+// error checking expands to nothing
+#define BT_GPU_CHECK_ERROR(s)
+
+
+#endif //BT_GPU_DEFINES_H
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedCode.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedCode.h
@@ -0,0 +1,55 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2009 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+//----------------------------------------------------------------------------------------
+
+// Shared code for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free from Bullet headers
+//  will be compiled by both CPU and CUDA compilers
+//	file with definitions of BT_GPU_xxx should be included first
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+//----------------------------------------------------------------------------------------
+
+#include "btGpuUtilsSharedDefs.h"
+
+//----------------------------------------------------------------------------------------
+
+extern "C"
+{
+
+//----------------------------------------------------------------------------------------
+
+// Integer division a / b, increased by one when the division leaves a
+// remainder (i.e. rounded up for positive operands). b must not be zero.
+int BT_GPU_PREF(iDivUp)(int a, int b)
+{
+	int quotient = a / b;
+	if((a % b) != 0)
+	{
+		quotient++;
+	}
+	return quotient;
+} // iDivUp()
+
+//----------------------------------------------------------------------------------------
+
+// Compute the launch configuration for n elements.
+//   n          - number of elements to process
+//   blockSize  - maximum number of threads per block
+//   numBlocks  - output: number of blocks needed to cover n elements
+//   numThreads - output: threads per block, min(blockSize, n)
+// When there is nothing to launch (n <= 0 or blockSize <= 0) both outputs are
+// set to 0. Without this guard n == 0 produced numThreads == 0 and iDivUp()
+// divided by zero.
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads)
+{
+    numThreads = BT_GPU_min(blockSize, n);
+    if(numThreads <= 0)
+	{
+		numThreads = 0;
+		numBlocks = 0;
+		return;
+	}
+    numBlocks = BT_GPU_PREF(iDivUp)(n, numThreads);
+} // computeGridSize()
+
+//----------------------------------------------------------------------------------------
+
+} // extern "C"
+
--- a/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedDefs.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/3dGridBroadphase/Shared/btGpuUtilsSharedDefs.h
@@ -0,0 +1,52 @@
+/*
+Bullet Continuous Collision Detection and Physics Library, http://bulletphysics.org
+Copyright (C) 2006, 2007 Sony Computer Entertainment Inc. 
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+// Shared definitions for GPU-based utilities
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//  Keep this file free of Bullet headers:
+//  it is included into both CUDA and CPU code.
+//  The file with the BT_GPU_xxx definitions must be included first.
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+#ifndef BTGPUUTILSDHAREDDEFS_H
+#define BTGPUUTILSDHAREDDEFS_H
+
+
+// C-linkage declarations of the shared GPU utility functions.
+// Every name is wrapped as BT_GPU_PREF(name)(args): only the function name is passed
+// to the macro, so the declarations do not depend on how BT_GPU_PREF treats extra tokens.
+extern "C"
+{
+
+
+//Round a / b to nearest higher integer value (b must be non-zero)
+int BT_GPU_PREF(iDivUp)(int a, int b);
+
+// compute grid and thread block size for a given number of elements
+// (results are written to numBlocks and numThreads)
+void BT_GPU_PREF(computeGridSize)(int n, int blockSize, int &numBlocks, int &numThreads);
+
+// Device memory helpers; they are only declared here, the definitions are not part of this header.
+// NOTE(review): size is presumably a byte count - confirm against the implementations.
+void BT_GPU_PREF(allocateArray)(void** devPtr, unsigned int size);
+void BT_GPU_PREF(freeArray)(void* devPtr);
+void BT_GPU_PREF(copyArrayFromDevice)(void* host, const void* device, unsigned int size);
+void BT_GPU_PREF(copyArrayToDevice)(void* device, const void* host, unsigned int size);
+
+// OpenGL buffer object helpers, also only declared here.
+// NOTE(review): vbo is presumably an OpenGL buffer object id - confirm against the implementations.
+void BT_GPU_PREF(registerGLBufferObject)(unsigned int vbo);
+void* BT_GPU_PREF(mapGLBufferObject)(unsigned int vbo);
+void BT_GPU_PREF(unmapGLBufferObject)(unsigned int vbo);
+
+
+} // extern "C"
+
+
+#endif // BTGPUUTILSDHAREDDEFS_H
+