add gpu_broadphase with basic test

2013-03-12 13:47:13 -07:00
parent 08272c7de5
commit 477a7f9e39
11 changed files with 1705 additions and 2 deletions
--- a/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
+++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.cpp
@@ -0,0 +1,485 @@
+
+#include "btGpuSapBroadphase.h"
+#include "parallel_primitives/host/btVector3.h"
+#include "parallel_primitives/host/btLauncherCL.h"
+#include "parallel_primitives/host/btQuickprof.h"
+#include "../basic_initialize/btOpenCLUtils.h"
+
+#include "../kernels/integrateKernel.h"
+
+#include "../kernels/sapKernels.h"
+#include "../kernels/sapFastKernels.h"
+#include "parallel_primitives/host/btMinMax.h"
+
+
+btGpuSapBroadphase::btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q )
+:m_context(ctx),
+m_device(device),
+m_queue(q),
+m_allAabbsGPU(ctx,q),
+m_smallAabbsGPU(ctx,q),
+m_largeAabbsGPU(ctx,q),
+m_overlappingPairs(ctx,q),
+m_gpuSmallSortData(ctx,q),
+m_gpuSmallSortedAabbs(ctx,q)
+{
+	const char* sapSrc = sapCL;
+    const char* sapFastSrc = sapFastCL;
+    
+	cl_int errNum=0;
+
+	cl_program sapProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"","opencl/gpu_broadphase/kernels/sap.cl");
+	btAssert(errNum==CL_SUCCESS);
+	cl_program sapFastProg = btOpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"","opencl/gpu_broadphase/kernels/sapFast.cl");
+	btAssert(errNum==CL_SUCCESS);
+
+	
+	//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
+	//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
+	//m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+
+	
+	m_sap2Kernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
+	btAssert(errNum==CL_SUCCESS);
+
+#if 0
+
+	m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
+	btAssert(errNum==CL_SUCCESS);
+#else
+#ifndef __APPLE__
+	m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
+	btAssert(errNum==CL_SUCCESS);
+#else
+	m_sapKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+	btAssert(errNum==CL_SUCCESS);
+#endif
+#endif
+
+	m_flipFloatKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
+
+	m_copyAabbsKernel= btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
+
+	m_scatterKernel = btOpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
+
+	m_sorter = new btRadixSort32CL(m_context,m_device,m_queue);
+}
+
+btGpuSapBroadphase::~btGpuSapBroadphase()
+{
+	delete m_sorter;
+	clReleaseKernel(m_scatterKernel);
+	clReleaseKernel(m_flipFloatKernel);
+	clReleaseKernel(m_copyAabbsKernel);
+	clReleaseKernel(m_sapKernel);
+	clReleaseKernel(m_sap2Kernel);
+
+}
+
+/// conservative test for overlap between two aabbs
+static bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aabbMax1,
+								const btVector3 &aabbMin2, const btVector3 &aabbMax2)
+{
+	bool overlap = true;
+	overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
+	overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
+	overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
+	return overlap;
+}
+
+void  btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
+{
+	int axis = 0;//todo on GPU for now hardcode
+
+	btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
+	
+
+	if (forceHost)
+	{
+
+	btAlignedObjectArray<btSapAabb> allHostAabbs;
+	m_allAabbsGPU.copyToHost(allHostAabbs);
+	
+	{
+		int numSmallAabbs = m_smallAabbsCPU.size();
+		for (int j=0;j<numSmallAabbs;j++)
+		{
+			//sync aabb
+			int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
+			m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
+			m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+		}
+	}
+
+	{
+		int numLargeAabbs = m_largeAabbsCPU.size();
+		for (int j=0;j<numLargeAabbs;j++)
+		{
+			//sync aabb
+			int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
+			m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
+			m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+
+		}
+	}
+
+	btAlignedObjectArray<btInt2> hostPairs;
+
+	{
+		int numSmallAabbs = m_smallAabbsCPU.size();
+		for (int i=0;i<numSmallAabbs;i++)
+		{
+			float reference = m_smallAabbsCPU[i].m_max[axis];
+
+			for (int j=i+1;j<numSmallAabbs;j++)
+			{
+				if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
+					(btVector3&)m_smallAabbsCPU[j].m_min,(btVector3&)m_smallAabbsCPU[j].m_max))
+				{
+					btInt2 pair;
+					pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
+					pair.y = m_smallAabbsCPU[j].m_minIndices[3];
+					hostPairs.push_back(pair);
+				}
+			}
+		}
+	}
+
+	
+	{
+		int numSmallAabbs = m_smallAabbsCPU.size();
+		for (int i=0;i<numSmallAabbs;i++)
+		{
+			float reference = m_smallAabbsCPU[i].m_max[axis];
+			int numLargeAabbs = m_largeAabbsCPU.size();
+
+			for (int j=0;j<numLargeAabbs;j++)
+			{
+				if (TestAabbAgainstAabb2((btVector3&)m_smallAabbsCPU[i].m_min, (btVector3&)m_smallAabbsCPU[i].m_max,
+					(btVector3&)m_largeAabbsCPU[j].m_min,(btVector3&)m_largeAabbsCPU[j].m_max))
+				{
+					btInt2 pair;
+					pair.x = m_largeAabbsCPU[j].m_minIndices[3];
+					pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
+					hostPairs.push_back(pair);
+				}
+			}
+		}
+	}
+
+
+	if (hostPairs.size())
+	{
+		m_overlappingPairs.copyFromHost(hostPairs);
+	} else
+	{
+		m_overlappingPairs.resize(0);
+	}
+
+	return;
+	}
+
+	{
+
+	bool syncOnHost = false;
+
+	if (syncOnHost)
+	{
+		BT_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
+		btAlignedObjectArray<btSapAabb> allHostAabbs;
+		m_allAabbsGPU.copyToHost(allHostAabbs);
+
+		m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
+		{
+			int numSmallAabbs = m_smallAabbsCPU.size();
+			for (int j=0;j<numSmallAabbs;j++)
+			{
+				//sync aabb
+				int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
+				m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
+				m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+			}
+		}
+		m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
+	
+	} else
+	{
+		{
+			int numSmallAabbs = m_smallAabbsGPU.size();
+			BT_PROFILE("copyAabbsKernelSmall");
+			btBufferInfoCL bInfo[] = { 
+				btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), 
+				btBufferInfoCL( m_smallAabbsGPU.getBufferCL()),
+			};
+
+			btLauncherCL launcher(m_queue, m_copyAabbsKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( numSmallAabbs  );
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		}
+	}
+
+	if (syncOnHost)
+	{
+		BT_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
+		btAlignedObjectArray<btSapAabb> allHostAabbs;
+		m_allAabbsGPU.copyToHost(allHostAabbs);
+
+		m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
+		{
+			int numLargeAabbs = m_largeAabbsCPU.size();
+			for (int j=0;j<numLargeAabbs;j++)
+			{
+				//sync aabb
+				int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
+				m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
+				m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+			}
+		}
+		m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
+	
+	} else
+	{
+		int numLargeAabbs = m_largeAabbsGPU.size();
+		
+		if (numLargeAabbs)
+		{
+			BT_PROFILE("copyAabbsKernelLarge");
+			btBufferInfoCL bInfo[] = { 
+				btBufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), 
+				btBufferInfoCL( m_largeAabbsGPU.getBufferCL()),
+			};
+
+			btLauncherCL launcher(m_queue, m_copyAabbsKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( numLargeAabbs  );
+			int num = numLargeAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		}
+	}
+
+
+
+
+		BT_PROFILE("GPU SAP");
+		
+		int numSmallAabbs = m_smallAabbsGPU.size();
+		m_gpuSmallSortData.resize(numSmallAabbs);
+		int numLargeAabbs = m_smallAabbsGPU.size();
+
+#if 1
+		if (m_smallAabbsGPU.size())
+		{
+			BT_PROFILE("flipFloatKernel");
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL())};
+			btLauncherCL launcher(m_queue, m_flipFloatKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( numSmallAabbs  );
+			launcher.setConst( axis  );
+			
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		}
+
+		{
+			BT_PROFILE("gpu radix sort\n");
+			m_sorter->execute(m_gpuSmallSortData);
+			clFinish(m_queue);
+		}
+
+		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
+		if (numSmallAabbs)
+		{
+			BT_PROFILE("scatterKernel");
+			btBufferInfoCL bInfo[] = { btBufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), btBufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),btBufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
+			btLauncherCL launcher(m_queue, m_scatterKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+			launcher.setConst( numSmallAabbs);
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			
+		}
+        
+
+			int maxPairsPerBody = 64;
+			int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
+			m_overlappingPairs.resize(maxPairs);
+
+			btOpenCLArray<int> pairCount(m_context, m_queue);
+			pairCount.push_back(0);
+            int numPairs=0;
+
+			{
+				int numLargeAabbs = m_largeAabbsGPU.size();
+				if (numLargeAabbs && numSmallAabbs)
+				{
+					BT_PROFILE("sap2Kernel");
+					btBufferInfoCL bInfo[] = { btBufferInfoCL( m_largeAabbsGPU.getBufferCL() ),btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
+					btLauncherCL launcher(m_queue, m_sap2Kernel);
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+					launcher.setConst(   numLargeAabbs  );
+					launcher.setConst( numSmallAabbs);
+					launcher.setConst( axis  );
+					launcher.setConst( maxPairs  );
+//@todo: use actual maximum work item sizes of the device instead of hardcoded values
+					launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
+                
+					numPairs = pairCount.at(0);
+					if (numPairs >maxPairs)
+						numPairs =maxPairs;
+					
+				}
+			}
+			if (m_gpuSmallSortedAabbs.size())
+			{
+				BT_PROFILE("sapKernel");
+				btBufferInfoCL bInfo[] = { btBufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), btBufferInfoCL( m_overlappingPairs.getBufferCL() ), btBufferInfoCL(pairCount.getBufferCL())};
+				btLauncherCL launcher(m_queue, m_sapKernel);
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
+				launcher.setConst( numSmallAabbs  );
+				launcher.setConst( axis  );
+				launcher.setConst( maxPairs  );
+
+			
+				int num = numSmallAabbs;
+#if 0                
+                int buffSize = launcher.getSerializationBufferSize();
+                unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
+                for (int i=0;i<buffSize+1;i++)
+                {
+                    unsigned char* ptr = (unsigned char*)&buf[i];
+                    *ptr = 0xff;
+                }
+                int actualWrite = launcher.serializeArguments(buf,buffSize);
+                
+                unsigned char* cptr = (unsigned char*)&buf[buffSize];
+    //            printf("buf[buffSize] = %d\n",*cptr);
+                
+                assert(buf[buffSize]==0xff);//check for buffer overrun
+                int* ptr = (int*)&buf[buffSize];
+                
+                *ptr = num;
+                
+                FILE* f = fopen("m_sapKernelArgs.bin","wb");
+                fwrite(buf,buffSize+sizeof(int),1,f);
+                fclose(f);
+#endif//
+
+                launcher.launch1D( num);
+				clFinish(m_queue);
+                
+                numPairs = pairCount.at(0);
+                if (numPairs>maxPairs)
+					numPairs = maxPairs;
+			}
+			
+#else
+        int numPairs = 0;
+        
+        
+        btLauncherCL launcher(m_queue, m_sapKernel);
+
+        const char* fileName = "m_sapKernelArgs.bin";
+        FILE* f = fopen(fileName,"rb");
+        if (f)
+        {
+            int sizeInBytes=0;
+            if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) 
+            {
+                printf("error, cannot get file size\n");
+                exit(0);
+            }
+            
+            unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
+            fread(buf,sizeInBytes,1,f);
+            int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
+            int num = *(int*)&buf[serializedBytes];
+            launcher.launch1D( num);
+            
+            btOpenCLArray<int> pairCount(m_context, m_queue);
+            int numElements = launcher.m_arrays[2]->size()/sizeof(int);
+            pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
+            numPairs = pairCount.at(0);
+            //printf("overlapping pairs = %d\n",numPairs);
+            btAlignedObjectArray<btInt2>		hostOoverlappingPairs;
+            btOpenCLArray<btInt2> tmpGpuPairs(m_context,m_queue);
+            tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
+   
+            tmpGpuPairs.copyToHost(hostOoverlappingPairs);
+            m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
+            //printf("hello %d\n", m_overlappingPairs.size());
+            free(buf);
+            fclose(f);
+            
+        } else {
+            printf("error: cannot find file %s\n",fileName);
+        }
+        
+        clFinish(m_queue);
+
+        
+#endif
+
+			
+        m_overlappingPairs.resize(numPairs);
+		
+	}//BT_PROFILE("GPU_RADIX SORT");
+
+}
+
+void btGpuSapBroadphase::writeAabbsToGpu()
+{
+	m_allAabbsGPU.copyFromHost(m_allAabbsCPU);//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
+	m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
+	m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
+
+}
+
+void btGpuSapBroadphase::createLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
+{
+	int index = userPtr;
+	btSapAabb aabb;
+	for (int i=0;i<4;i++)
+	{
+		aabb.m_min[i] = aabbMin[i];
+		aabb.m_max[i] = aabbMax[i];
+	}
+	aabb.m_minIndices[3] = index;
+	aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
+	m_largeAabbsCPU.push_back(aabb);
+	m_allAabbsCPU.push_back(aabb);
+}
+
+void btGpuSapBroadphase::createProxy(const btVector3& aabbMin,  const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
+{
+	int index = userPtr;
+	btSapAabb aabb;
+	for (int i=0;i<4;i++)
+	{
+		aabb.m_min[i] = aabbMin[i];
+		aabb.m_max[i] = aabbMax[i];
+	}
+	aabb.m_minIndices[3] = index;
+	aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
+	m_smallAabbsCPU.push_back(aabb);
+	m_allAabbsCPU.push_back(aabb);
+}
+
+cl_mem	btGpuSapBroadphase::getAabbBuffer()
+{
+	return m_allAabbsGPU.getBufferCL();
+}
+
+int	btGpuSapBroadphase::getNumOverlap()
+{
+	return m_overlappingPairs.size();
+}
+cl_mem	btGpuSapBroadphase::getOverlappingPairBuffer()
+{
+	return m_overlappingPairs.getBufferCL();
+}
--- a/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
+++ b/opencl/gpu_broadphase/host/btGpuSapBroadphase.h
@@ -0,0 +1,74 @@
+#ifndef BT_GPU_SAP_BROADPHASE_H
+#define BT_GPU_SAP_BROADPHASE_H
+
+#include "parallel_primitives/host/btOpenCLArray.h"
+#include "parallel_primitives/host/btFillCL.h" //btInt2
+class btVector3;
+#include "parallel_primitives/host/btRadixSort32CL.h"
+
+struct btSapAabb
+{
+	union
+	{
+		float m_min[4];
+		int m_minIndices[4];
+	};
+	union
+	{
+		float m_max[4];
+		int m_signedMaxIndices[4];
+		//unsigned int m_unsignedMaxIndices[4];
+	};
+};
+
+
+
+class btGpuSapBroadphase
+{
+	
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue;
+	cl_kernel				m_flipFloatKernel;
+	cl_kernel				m_scatterKernel ;
+	cl_kernel				m_copyAabbsKernel;
+	cl_kernel				m_sapKernel;
+	cl_kernel				m_sap2Kernel;
+
+	class btRadixSort32CL* m_sorter;
+
+	public:
+	
+	btOpenCLArray<btSapAabb>	m_allAabbsGPU;
+	btAlignedObjectArray<btSapAabb>	m_allAabbsCPU;
+
+	btOpenCLArray<btSapAabb>	m_smallAabbsGPU;
+	btAlignedObjectArray<btSapAabb>	m_smallAabbsCPU;
+
+	btOpenCLArray<btSapAabb>	m_largeAabbsGPU;
+	btAlignedObjectArray<btSapAabb>	m_largeAabbsCPU;
+
+	btOpenCLArray<btInt2>		m_overlappingPairs;
+
+	//temporary gpu work memory
+	btOpenCLArray<btSortData>	m_gpuSmallSortData;
+	btOpenCLArray<btSapAabb>	m_gpuSmallSortedAabbs;
+
+
+	btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q );
+	virtual ~btGpuSapBroadphase();
+	
+	void  calculateOverlappingPairs(bool forceHost=false);
+
+	void createProxy(const btVector3& aabbMin,  const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+	void createLargeProxy(const btVector3& aabbMin,  const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	void writeAabbsToGpu();
+
+	cl_mem	getAabbBuffer();
+	int	getNumOverlap();
+	cl_mem	getOverlappingPairBuffer();
+};
+
+#endif //BT_GPU_SAP_BROADPHASE_H
--- a/opencl/gpu_broadphase/kernels/sap.cl
+++ b/opencl/gpu_broadphase/kernels/sap.cl
@@ -0,0 +1,320 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+/// conservative test for overlap between two aabbs
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+
+
+__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	if (i>=numUnsortedAabbs)
+		return;
+
+	int j = get_global_id(1);
+	if (j>=numSortedAabbs)
+		return;
+
+	if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))
+	{
+		int2 myPair;
+		
+		myPair.x = unsortedAabbs[i].m_minIndices[3];
+		myPair.y = sortedAabbs[j].m_minIndices[3];
+
+		int curPair = atomic_inc (pairCount);
+		if (curPair<maxPairs)
+		{
+				pairsOut[curPair] = myPair; //flush to main memory
+		}
+	}
+}
+
+__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+	for (int j=i+1;j<numObjects;j++)
+	{
+  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) 
+		{
+			break;
+		}
+		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+		{
+			int2 myPair;
+			myPair.x = aabbs[i].m_minIndices[3];
+			myPair.y = aabbs[j].m_minIndices[3];
+			int curPair = atomic_inc (pairCount);
+			if (curPair<maxPairs)
+			{
+					pairsOut[curPair] = myPair; //flush to main memory
+			}
+		}
+	}
+}
+
+
+
+
+__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+			{
+				int2 myPair;
+				myPair.x = aabbs[i].m_minIndices[3];
+				myPair.y = aabbs[j].m_minIndices[3];
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		j++;
+
+	} while (breakRequest[0]<numActiveWgItems[0]);
+}
+
+
+__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	
+	btAabbCL myAabb;
+	
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	int localCount=0;
+	int block=0;
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int2 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		localCount++;
+		if (localCount==64)
+		{
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+		
+	} while (breakRequest[0]<numActiveWgItems[0]);
+	
+}
+
+
+
+
+//http://stereopsis.com/radix.html
+unsigned int FloatFlip(float fl);
+unsigned int FloatFlip(float fl)
+{
+	unsigned int f = *(unsigned int*)&fl;
+	unsigned int mask = -(int)(f >> 31) | 0x80000000;
+	return f ^ mask;
+}
+float IFloatFlip(unsigned int f);
+float IFloatFlip(unsigned int f)
+{
+	unsigned int mask = ((f >> 31) - 1) | 0x80000000;
+	unsigned int fl = f ^ mask;
+	return *(float*)&fl;
+}
+
+
+
+
+__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+	int src = destAabbs[i].m_maxIndices[3];
+	destAabbs[i] = allAabbs[src];
+	destAabbs[i].m_maxIndices[3] = src;
+}
+
+
+__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+		
+		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);
+		sortData[i].y = i;
+		
+}
+
+
+__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+
+		sortedAabbs[i] = aabbs[sortData[i].y];
+}
--- a/opencl/gpu_broadphase/kernels/sapFast.cl
+++ b/opencl/gpu_broadphase/kernels/sapFast.cl
@@ -0,0 +1,161 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+/// conservative test for overlap between two aabbs
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+{
+//skip pairs between static (mass=0) objects
+	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
+		return false;
+		
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+
+
+//computePairsKernelBatchWrite
+__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	
+	int2 myPairs[64];
+	
+	btAabbCL myAabb;
+	
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	int localCount=0;
+	int block=0;
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	int curNumPairs = 0;
+	
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int2 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				myPairs[curNumPairs] = myPair;
+				curNumPairs++;
+				if (curNumPairs==64)
+				{
+					int curPair = atomic_add(pairCount,curNumPairs);
+					//avoid a buffer overrun
+					if ((curPair+curNumPairs)<maxPairs)
+					{
+						for (int p=0;p<curNumPairs;p++)
+						{
+							pairsOut[curPair+p] = myPairs[p]; //flush to main memory
+						}
+					}
+					curNumPairs = 0;
+				}
+			}
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		localCount++;
+		if (localCount==64)
+		{
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+		
+	} while (breakRequest[0]<numActiveWgItems[0]);
+	
+	
+	if (curNumPairs>0)
+	{
+		//avoid a buffer overrun
+		int curPair = atomic_add(pairCount,curNumPairs);
+		if ((curPair+curNumPairs)<maxPairs)
+		{
+			for (int p=0;p<curNumPairs;p++)
+			{
+					pairsOut[curPair+p] = myPairs[p]; //flush to main memory
+			}
+		}
+		curNumPairs = 0;
+	}
+}
--- a/opencl/gpu_broadphase/kernels/sapFastKernels.h
+++ b/opencl/gpu_broadphase/kernels/sapFastKernels.h
@@ -0,0 +1,164 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* sapFastCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Erwin Coumans\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"\n"
+"\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
+"{\n"
+"//skip pairs between static (mass=0) objects\n"
+"	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
+"		return false;\n"
+"		\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"\n"
+"//computePairsKernelBatchWrite\n"
+"__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	int2 myPairs[64];\n"
+"	\n"
+"	btAabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	int curNumPairs = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				myPairs[curNumPairs] = myPair;\n"
+"				curNumPairs++;\n"
+"				if (curNumPairs==64)\n"
+"				{\n"
+"					int curPair = atomic_add(pairCount,curNumPairs);\n"
+"					//avoid a buffer overrun\n"
+"					if ((curPair+curNumPairs)<maxPairs)\n"
+"					{\n"
+"						for (int p=0;p<curNumPairs;p++)\n"
+"						{\n"
+"							pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
+"						}\n"
+"					}\n"
+"					curNumPairs = 0;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"	\n"
+"	if (curNumPairs>0)\n"
+"	{\n"
+"		//avoid a buffer overrun\n"
+"		int curPair = atomic_add(pairCount,curNumPairs);\n"
+"		if ((curPair+curNumPairs)<maxPairs)\n"
+"		{\n"
+"			for (int p=0;p<curNumPairs;p++)\n"
+"			{\n"
+"					pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"		curNumPairs = 0;\n"
+"	}\n"
+"}\n"
+;
--- a/opencl/gpu_broadphase/kernels/sapKernels.h
+++ b/opencl/gpu_broadphase/kernels/sapKernels.h
@@ -0,0 +1,324 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* sapCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Erwin Coumans\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} btAabbCL;\n"
+"\n"
+"\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numUnsortedAabbs)\n"
+"		return;\n"
+"\n"
+"	int j = get_global_id(1);\n"
+"	if (j>=numSortedAabbs)\n"
+"		return;\n"
+"\n"
+"	if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
+"	{\n"
+"		int2 myPair;\n"
+"		\n"
+"		myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
+"		myPair.y = sortedAabbs[j].m_minIndices[3];\n"
+"\n"
+"		int curPair = atomic_inc (pairCount);\n"
+"		if (curPair<maxPairs)\n"
+"		{\n"
+"				pairsOut[curPair] = myPair; //flush to main memory\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	for (int j=i+1;j<numObjects;j++)\n"
+"	{\n"
+"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"		{\n"
+"			break;\n"
+"		}\n"
+"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"		{\n"
+"			int2 myPair;\n"
+"			myPair.x = aabbs[i].m_minIndices[3];\n"
+"			myPair.y = aabbs[j].m_minIndices[3];\n"
+"			int curPair = atomic_inc (pairCount);\n"
+"			if (curPair<maxPairs)\n"
+"			{\n"
+"					pairsOut[curPair] = myPair; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = aabbs[i].m_minIndices[3];\n"
+"				myPair.y = aabbs[j].m_minIndices[3];\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		j++;\n"
+"\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local btAabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	btAabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"//http://stereopsis.com/radix.html\n"
+"unsigned int FloatFlip(float fl);\n"
+"unsigned int FloatFlip(float fl)\n"
+"{\n"
+"	unsigned int f = *(unsigned int*)&fl;\n"
+"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
+"	return f ^ mask;\n"
+"}\n"
+"float IFloatFlip(unsigned int f);\n"
+"float IFloatFlip(unsigned int f)\n"
+"{\n"
+"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
+"	unsigned int fl = f ^ mask;\n"
+"	return *(float*)&fl;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	int src = destAabbs[i].m_maxIndices[3];\n"
+"	destAabbs[i] = allAabbs[src];\n"
+"	destAabbs[i].m_maxIndices[3] = src;\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"		\n"
+"		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
+"		sortData[i].y = i;\n"
+"		\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"\n"
+"		sortedAabbs[i] = aabbs[sortData[i].y];\n"
+"}\n"
+"\n"
+;
--- a/opencl/gpu_broadphase/test/main.cpp
+++ b/opencl/gpu_broadphase/test/main.cpp
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+
+
+#include <stdio.h>
+#include "../basic_initialize/btOpenCLUtils.h"
+#include "../host/btGpuSapBroadphase.h"
+#include "parallel_primitives/host/btVector3.h"
+#include "parallel_primitives/host/btFillCL.h"
+#include "parallel_primitives/host/btBoundSearchCL.h"
+#include "parallel_primitives/host/btRadixSort32CL.h"
+#include "parallel_primitives/host/btPrefixScanCL.h"
+#include "parallel_primitives/host/CommandLineArgs.h"
+
+#include "parallel_primitives/host/btMinMax.h"
+int g_nPassed = 0;
+int g_nFailed = 0;
+bool g_testFailed = 0;
+
+#define TEST_INIT g_testFailed = 0;
+#define TEST_ASSERT(x) if( !(x) ){g_testFailed = 1;}
+#define TEST_REPORT(testName) printf("[%s] %s\n",(g_testFailed)?"X":"O", testName); if(g_testFailed) g_nFailed++; else g_nPassed++;
+#define NEXTMULTIPLEOF(num, alignment) (((num)/(alignment) + (((num)%(alignment)==0)?0:1))*(alignment))
+
+cl_context g_context=0;
+cl_device_id g_device=0;
+cl_command_queue g_queue =0;
+const char* g_deviceName = 0;
+
+void initCL(int preferredDeviceIndex, int preferredPlatformIndex)
+{
+	void* glCtx=0;
+	void* glDC = 0;
+	int ciErrNum = 0;
+	//bound search and radix sort only work on GPU right now (assume 32 or 64 width workgroup without barriers)
+
+	cl_device_type deviceType = CL_DEVICE_TYPE_ALL;
+
+	g_context = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0,0,preferredDeviceIndex, preferredPlatformIndex);
+	oclCHECKERROR(ciErrNum, CL_SUCCESS);
+	int numDev = btOpenCLUtils::getNumDevices(g_context);
+	if (numDev>0)
+	{
+		btOpenCLDeviceInfo info;
+		g_device= btOpenCLUtils::getDevice(g_context,0);
+		g_queue = clCreateCommandQueue(g_context, g_device, 0, &ciErrNum);
+		oclCHECKERROR(ciErrNum, CL_SUCCESS);
+        btOpenCLUtils::printDeviceInfo(g_device);
+		btOpenCLUtils::getDeviceInfo(g_device,&info);
+		g_deviceName = info.m_deviceName;
+	}
+}
+
+void exitCL()
+{
+	clReleaseCommandQueue(g_queue);
+	clReleaseContext(g_context);
+}
+
+
+inline void broadphaseTest()
+{
+	TEST_INIT;
+
+	btGpuSapBroadphase* sap = new btGpuSapBroadphase(g_context,g_device,g_queue);
+	int group=1;
+	int mask=1;
+	btVector3 aabbMin(0,0,0);
+	btVector3 aabbMax(1,1,1);
+	int usrPtr = 1;
+	sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
+
+//	aabbMin.setValue(2,2,2);
+//	aabbMax.setValue(3,3,3);
+	usrPtr = 2;
+	sap->createProxy(aabbMin,aabbMax,usrPtr,group,mask);
+	sap->writeAabbsToGpu();
+
+	sap->calculateOverlappingPairs();
+	
+	int numOverlap = sap->getNumOverlap();
+	cl_mem buf = sap->getOverlappingPairBuffer();
+	
+	TEST_ASSERT(numOverlap==1);
+
+	delete sap;
+
+	TEST_REPORT( "broadphaseTest" );
+}
+
+int main(int argc, char** argv)
+{
+	int preferredDeviceIndex = -1;
+	int preferredPlatformIndex = -1;
+
+	CommandLineArgs args(argc, argv);
+	args.GetCmdLineArgument("deviceId", preferredDeviceIndex);
+	args.GetCmdLineArgument("platformId", preferredPlatformIndex);
+
+	initCL(preferredDeviceIndex,preferredPlatformIndex);
+
+
+	broadphaseTest();
+
+	printf("%d tests passed, %d tests failed\n",g_nPassed, g_nFailed);
+	printf("End, press <enter>\n");
+	getchar();
+
+	exitCL();
+
+}
+
--- a/opencl/gpu_broadphase/test/premake4.lua
+++ b/opencl/gpu_broadphase/test/premake4.lua
@@ -0,0 +1,46 @@
+function createProject(vendor)	
+	hasCL = findOpenCL(vendor)
+	
+	if (hasCL) then
+
+		project ("OpenCL_broadphase_test_" .. vendor)
+
+		initOpenCL(vendor)
+
+		language "C++"
+				
+		kind "ConsoleApp"
+		targetdir "../../../bin"
+		includedirs {"..","../.."}
+		
+		
+		files {
+			"main.cpp",
+			"../../basic_initialize/btOpenCLInclude.h",
+			"../../basic_initialize/btOpenCLUtils.cpp",
+			"../../basic_initialize/btOpenCLUtils.h",
+			"../host/btGpuSapBroadphase.cpp",
+			"../host/btGpuSapBroadphase.h",
+			"../../parallel_primitives/host/btFillCL.cpp",
+			"../../parallel_primitives/host/btFillCL.h",
+			"../../parallel_primitives/host/btBoundSearchCL.cpp",
+			"../../parallel_primitives/host/btBoundSearchCL.h",
+			"../../parallel_primitives/host/btPrefixScanCL.cpp",
+			"../../parallel_primitives/host/btPrefixScanCL.h",
+			"../../parallel_primitives/host/btRadixSort32CL.cpp",
+			"../../parallel_primitives/host/btRadixSort32CL.h",
+			"../../parallel_primitives/host/btAlignedAllocator.cpp",
+			"../../parallel_primitives/host/btAlignedAllocator.h",
+			"../../parallel_primitives/host/btAlignedObjectArray.h",
+			"../../parallel_primitives/host/btQuickprof.cpp",
+			"../../parallel_primitives/host/btQuickprof.h",
+
+		}
+		
+	end
+end
+
+createProject("AMD")
+createProject("Intel")
+createProject("NVIDIA")
+createProject("Apple")
--- a/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp
+++ b/opencl/parallel_primitives/benchmark/test_large_problem_sorting.cpp
@@ -652,6 +652,8 @@ int main( int argc, char** argv)
 	printf("Initialize OpenCL using btOpenCLUtils_createContextFromType\n");
 	cl_platform_id platformId;
 	g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_ALL, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+//	g_cxMainContext = btOpenCLUtils_createContextFromType(CL_DEVICE_TYPE_GPU, &ciErrNum, 0, 0,gPreferredDeviceId,gPreferredPlatformId,&platformId);
+	
 	oclCHECKERROR(ciErrNum, CL_SUCCESS);

 	int numDev = btOpenCLUtils_getNumDevices(g_cxMainContext);
@@ -674,7 +676,7 @@ int main( int argc, char** argv)
 	//srand(time(NULL));	
 	srand(0);				// presently deterministic

-    unsigned int num_elements 					= 32*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
+    unsigned int num_elements 					= 8*1024*1024;//4*1024*1024;//4*1024*1024;//257;//8*524288;//2048;//512;//524288;
    unsigned int iterations  					= 10;
    bool keys_only = true;