reorder files, in preparation for Bullet 3 -> Bullet 2 merge

2013-04-29 19:04:08 -07:00
parent 55b69201a9
commit 3ac332f3a7
162 changed files with 215 additions and 3070 deletions
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
@@ -0,0 +1,565 @@
+
+#include "b3GpuSapBroadphase.h"
+#include "Bullet3Common/b3Vector3.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
+#include "Bullet3Common/b3Quickprof.h"
+#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
+#include "kernels/sapKernels.h"
+#include "kernels/sapFastKernels.h"
+#include "Bullet3Common/b3MinMax.h"
+#include <string.h> //memcpy
+
+#define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
+#define B3_BROADPHASE_SAPFAST_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl"
+
+/// Compiles the two SAP OpenCL programs, creates the kernels used by the broadphase
+/// and allocates the radix sorter.
+/// @param ctx     OpenCL context handed to the device arrays, the kernels and the sorter
+/// @param device  device the programs are compiled for
+/// @param q       command queue handed to the device arrays and the sorter
+b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q )
+:m_context(ctx),
+m_device(device),
+m_queue(q),
+m_currentBuffer(-1),//listed in declaration order; a negative value means init3dSap has not run yet
+m_allAabbsGPU(ctx,q),
+m_smallAabbsGPU(ctx,q),
+m_largeAabbsGPU(ctx,q),
+m_overlappingPairs(ctx,q),
+m_gpuSmallSortData(ctx,q),
+m_gpuSmallSortedAabbs(ctx,q)
+{
+	const char* sapSrc = sapCL;
+	const char* sapFastSrc = sapFastCL;
+
+	cl_int errNum=0;
+
+	//NOTE(review): sapProg/sapFastProg are never released in this file; confirm whether b3OpenCLUtils keeps ownership of them
+	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
+	b3Assert(errNum==CL_SUCCESS);
+	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
+	b3Assert(errNum==CL_SUCCESS);
+
+	//alternative single-array kernels available in sap.cl:
+	//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
+	//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelBarrier",&errNum,sapProg );
+	//m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+
+	//large-versus-small kernel (two separate arrays)
+	m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelTwoArrays",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+
+	//small-versus-small kernel: sapFast.cl everywhere except on Apple, where the sap.cl local-memory kernel is used
+#if 0
+
+	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelOriginal",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+#else
+#ifndef __APPLE__
+	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsKernel",&errNum,sapFastProg );
+	b3Assert(errNum==CL_SUCCESS);
+#else
+	m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "computePairsKernelLocalSharedMemory",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+#endif
+#endif
+
+	//helper kernels; check the error code after each one, like the kernels above
+	m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "flipFloatKernel",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+
+	m_copyAabbsKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "copyAabbsKernel",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+
+	m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "scatterKernel",&errNum,sapProg );
+	b3Assert(errNum==CL_SUCCESS);
+
+	m_sorter = new b3RadixSort32CL(m_context,m_device,m_queue);
+}
+
+/// Deletes the radix sorter and releases the five OpenCL kernels owned by this object.
+b3GpuSapBroadphase::~b3GpuSapBroadphase()
+{
+	delete m_sorter;
+
+	//the kernel handles are independent of each other; release them in declaration order
+	clReleaseKernel(m_flipFloatKernel);
+	clReleaseKernel(m_scatterKernel);
+	clReleaseKernel(m_copyAabbsKernel);
+	clReleaseKernel(m_sapKernel);
+	clReleaseKernel(m_sap2Kernel);
+}
+
+/// conservative test for overlap between two aabbs
+/// Returns false when the boxes are separated along X, Z or Y; boxes that merely touch count as overlapping.
+static bool TestAabbAgainstAabb2(const b3Vector3 &aabbMin1, const b3Vector3 &aabbMax1,
+								const b3Vector3 &aabbMin2, const b3Vector3 &aabbMax2)
+{
+	if (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX())
+		return false;
+	if (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ())
+		return false;
+	if (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY())
+		return false;
+	return true;
+}
+
+
+
+//http://stereopsis.com/radix.html
+/// Converts a float into an unsigned key whose unsigned ordering matches the float ordering:
+/// negative values get all bits inverted, non-negative values only get the sign bit set.
+/// @param fl  value to convert
+/// @return    sortable 32-bit key
+static unsigned int FloatFlip(float fl)
+{
+	unsigned int f;
+	//copy the bit pattern; reading the float through an unsigned int pointer breaks strict aliasing
+	memcpy(&f,&fl,sizeof(f));
+	unsigned int mask = -(int)(f >> 31) | 0x80000000;
+	return f ^ mask;
+}
+
+/// One-time setup of the experimental 3-axis SAP data: reads all AABBs back from the GPU,
+/// sizes both sort buffers of every axis and fills buffer 0 with
+/// (flipped min coordinate, aabb index) entries. The entries are written in aabb order,
+/// nothing is sorted here. Does nothing when m_currentBuffer is already >= 0.
+void  b3GpuSapBroadphase::init3dSap()
+{
+	if (m_currentBuffer>=0)
+		return;
+
+	m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+	m_currentBuffer = 0;
+
+	const int totalNumAabbs = m_allAabbsCPU.size();
+	for (int axis=0;axis<3;axis++)
+	{
+		for (int buf=0;buf<2;buf++)
+		{
+			b3AlignedObjectArray<b3SortData>& sortedAxis = m_sortedAxisCPU[axis][buf];
+			sortedAxis.resize(totalNumAabbs);
+
+			if (buf!=m_currentBuffer)
+				continue;
+
+			for (int i=0;i<totalNumAabbs;i++)
+			{
+				//use the float view (m_min): m_minIndices is the int view of the same bits and
+				//would be converted numerically when passed to FloatFlip(float)
+				sortedAxis[i].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]);
+				sortedAxis[i].m_value = i;
+			}
+		}
+	}
+}
+/// Experimental host step of the 3-axis SAP: reads the AABBs back from the GPU, switches to the
+/// other sort buffer and refills it, per axis, with (flipped min coordinate, aabb index) entries.
+/// Requires init3dSap to have run (m_currentBuffer >= 0); otherwise it asserts and returns.
+/// The entries are written in aabb order; no sorting or pair finding happens here.
+void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
+{
+	b3Assert(m_currentBuffer>=0);
+	if (m_currentBuffer<0)
+		return;
+
+	m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+
+	//both buffers of every axis must already have one entry per aabb
+	for (int axis=0;axis<3;axis++)
+	{
+		for (int buf=0;buf<2;buf++)
+		{
+			b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size());
+		}
+	}
+
+	//flip between buffer 0 and 1
+	m_currentBuffer = 1-m_currentBuffer;
+
+	const int totalNumAabbs = m_allAabbsCPU.size();
+	for (int axis=0;axis<3;axis++)
+	{
+		b3AlignedObjectArray<b3SortData>& sortedAxis = m_sortedAxisCPU[axis][m_currentBuffer];
+		for (int i=0;i<totalNumAabbs;i++)
+		{
+			//use the float view (m_min): m_minIndices is the int view of the same bits and
+			//would be converted numerically when passed to FloatFlip(float)
+			sortedAxis[i].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]);
+			sortedAxis[i].m_value = i;
+		}
+	}
+}
+
+/// Host (CPU) version of the pair search: reads all AABBs back from the GPU, refreshes the
+/// small and large AABB lists from them, brute-force tests every small/small and every
+/// small/large combination and stores the resulting pairs in m_overlappingPairs.
+void  b3GpuSapBroadphase::calculateOverlappingPairsHost()
+{
+	//test
+	//if (m_currentBuffer>=0)
+	//	calculateOverlappingPairsHostIncremental3Sap();
+
+	b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
+
+	m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+
+	const int numSmallAabbs = m_smallAabbsCPU.size();
+	const int numLargeAabbs = m_largeAabbsCPU.size();
+
+	//sync small aabbs: m_signedMaxIndices[3] is the index into m_allAabbsCPU and has to survive the copy
+	for (int j=0;j<numSmallAabbs;j++)
+	{
+		int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
+		m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
+		m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+	}
+
+	//sync large aabbs the same way
+	for (int j=0;j<numLargeAabbs;j++)
+	{
+		int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
+		m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
+		m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+	}
+
+	b3AlignedObjectArray<b3Int2> hostPairs;
+
+	//small versus small, each unordered combination once
+	for (int i=0;i<numSmallAabbs;i++)
+	{
+		for (int j=i+1;j<numSmallAabbs;j++)
+		{
+			if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
+				(b3Vector3&)m_smallAabbsCPU[j].m_min,(b3Vector3&)m_smallAabbsCPU[j].m_max))
+			{
+				b3Int2 pair;
+				pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
+				pair.y = m_smallAabbsCPU[j].m_minIndices[3];
+				hostPairs.push_back(pair);
+			}
+		}
+	}
+
+	//small versus large; the large object goes into pair.x
+	for (int i=0;i<numSmallAabbs;i++)
+	{
+		for (int j=0;j<numLargeAabbs;j++)
+		{
+			if (TestAabbAgainstAabb2((b3Vector3&)m_smallAabbsCPU[i].m_min, (b3Vector3&)m_smallAabbsCPU[i].m_max,
+				(b3Vector3&)m_largeAabbsCPU[j].m_min,(b3Vector3&)m_largeAabbsCPU[j].m_max))
+			{
+				b3Int2 pair;
+				pair.x = m_largeAabbsCPU[j].m_minIndices[3];
+				pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
+				hostPairs.push_back(pair);
+			}
+		}
+	}
+
+	if (hostPairs.size())
+	{
+		m_overlappingPairs.copyFromHost(hostPairs);
+	} else
+	{
+		m_overlappingPairs.resize(0);
+	}
+
+	//init3dSap();
+
+}
+
+/// GPU version of the pair search. Steps, all on m_queue:
+///  1. refresh m_smallAabbsGPU and m_largeAabbsGPU from m_allAabbsGPU (copyAabbsKernel)
+///  2. build sort keys for the small AABBs along 'axis' (flipFloatKernel) and radix sort them
+///  3. reorder the small AABBs by the sorted keys (scatterKernel)
+///  4. large-versus-small pairs (m_sap2Kernel), then small-versus-small pairs (m_sapKernel)
+/// On return m_overlappingPairs is resized to the number of pairs found, clamped to
+/// maxPairsPerBody * number of small AABBs.
+void  b3GpuSapBroadphase::calculateOverlappingPairs()
+{
+	int axis = 0;//todo on GPU for now hardcode
+
+
+
+	{
+
+	//debugging switch: when true the small/large arrays are refreshed on the CPU instead of by copyAabbsKernel
+	bool syncOnHost = false;
+
+	if (syncOnHost)
+	{
+		B3_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
+		
+		m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+
+		m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
+		{
+			int numSmallAabbs = m_smallAabbsCPU.size();
+			for (int j=0;j<numSmallAabbs;j++)
+			{
+				//sync aabb
+				int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
+				m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
+				m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+			}
+		}
+		m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
+	
+	} else
+	{
+		{
+			int numSmallAabbs = m_smallAabbsGPU.size();
+			if (numSmallAabbs)
+			{
+				B3_PROFILE("copyAabbsKernelSmall");
+				b3BufferInfoCL bInfo[] = { 
+					b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), 
+					b3BufferInfoCL( m_smallAabbsGPU.getBufferCL()),
+				};
+
+				b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( numSmallAabbs  );
+				int num = numSmallAabbs;
+				launcher.launch1D( num);
+				clFinish(m_queue);
+			}
+		}
+	}
+
+	if (syncOnHost)
+	{
+		B3_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
+		
+		m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+
+		m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
+		{
+			int numLargeAabbs = m_largeAabbsCPU.size();
+			for (int j=0;j<numLargeAabbs;j++)
+			{
+				//sync aabb
+				int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
+				m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
+				m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
+			}
+		}
+		m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
+	
+	} else
+	{
+		int numLargeAabbs = m_largeAabbsGPU.size();
+		
+		if (numLargeAabbs)
+		{
+			B3_PROFILE("copyAabbsKernelLarge");
+			b3BufferInfoCL bInfo[] = { 
+				b3BufferInfoCL( m_allAabbsGPU.getBufferCL(), true ), 
+				b3BufferInfoCL( m_largeAabbsGPU.getBufferCL()),
+			};
+
+			b3LauncherCL launcher(m_queue, m_copyAabbsKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( numLargeAabbs  );
+			int num = numLargeAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		}
+	}
+
+
+
+
+		B3_PROFILE("GPU SAP");
+		
+		int numSmallAabbs = m_smallAabbsGPU.size();
+		m_gpuSmallSortData.resize(numSmallAabbs);
+		//the number of large aabbs is queried right before the sap2Kernel launch below
+
+#if 1
+		//one (flipped float key, index) entry per small aabb
+		if (m_smallAabbsGPU.size())
+		{
+			B3_PROFILE("flipFloatKernel");
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL())};
+			b3LauncherCL launcher(m_queue, m_flipFloatKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( numSmallAabbs  );
+			launcher.setConst( axis  );
+			
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+		}
+
+		{
+			B3_PROFILE("gpu radix sort\n");
+			m_sorter->execute(m_gpuSmallSortData);
+			clFinish(m_queue);
+		}
+
+		//gather the small aabbs in sorted order
+		m_gpuSmallSortedAabbs.resize(numSmallAabbs);
+		if (numSmallAabbs)
+		{
+			B3_PROFILE("scatterKernel");
+			b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_smallAabbsGPU.getBufferCL(), true ), b3BufferInfoCL( m_gpuSmallSortData.getBufferCL(),true),b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
+			b3LauncherCL launcher(m_queue, m_scatterKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+			launcher.setConst( numSmallAabbs);
+			int num = numSmallAabbs;
+			launcher.launch1D( num);
+			clFinish(m_queue);
+			
+		}
+        
+
+			//output capacity; the kernels only store the first maxPairs pairs
+			int maxPairsPerBody = 64;
+			int maxPairs = maxPairsPerBody * numSmallAabbs;//todo
+			m_overlappingPairs.resize(maxPairs);
+
+			//single-int device counter shared by both pair kernels
+			b3OpenCLArray<int> pairCount(m_context, m_queue);
+			pairCount.push_back(0);
+            int numPairs=0;
+
+			{
+				int numLargeAabbs = m_largeAabbsGPU.size();
+				if (numLargeAabbs && numSmallAabbs)
+				{
+					B3_PROFILE("sap2Kernel");
+					b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_largeAabbsGPU.getBufferCL() ),b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
+					b3LauncherCL launcher(m_queue, m_sap2Kernel);
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+					launcher.setConst(   numLargeAabbs  );
+					launcher.setConst( numSmallAabbs);
+					launcher.setConst( axis  );
+					launcher.setConst( maxPairs  );
+//@todo: use actual maximum work item sizes of the device instead of hardcoded values
+					launcher.launch2D( numLargeAabbs, numSmallAabbs,4,64);
+                
+					numPairs = pairCount.at(0);
+					if (numPairs >maxPairs)
+						numPairs =maxPairs;
+					
+				}
+			}
+			if (m_gpuSmallSortedAabbs.size())
+			{
+				B3_PROFILE("sapKernel");
+				b3BufferInfoCL bInfo[] = { b3BufferInfoCL( m_gpuSmallSortedAabbs.getBufferCL() ), b3BufferInfoCL( m_overlappingPairs.getBufferCL() ), b3BufferInfoCL(pairCount.getBufferCL())};
+				b3LauncherCL launcher(m_queue, m_sapKernel);
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
+				launcher.setConst( numSmallAabbs  );
+				launcher.setConst( axis  );
+				launcher.setConst( maxPairs  );
+
+			
+				int num = numSmallAabbs;
+#if 0                
+                //debugging aid: dump the kernel arguments so the #else branch below can replay them
+                int buffSize = launcher.getSerializationBufferSize();
+                unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
+                for (int i=0;i<buffSize+1;i++)
+                {
+                    unsigned char* ptr = (unsigned char*)&buf[i];
+                    *ptr = 0xff;
+                }
+                int actualWrite = launcher.serializeArguments(buf,buffSize);
+                
+                unsigned char* cptr = (unsigned char*)&buf[buffSize];
+    //            printf("buf[buffSize] = %d\n",*cptr);
+                
+                assert(buf[buffSize]==0xff);//check for buffer overrun
+                int* ptr = (int*)&buf[buffSize];
+                
+                *ptr = num;
+                
+                FILE* f = fopen("m_sapKernelArgs.bin","wb");
+                fwrite(buf,buffSize+sizeof(int),1,f);
+                fclose(f);
+#endif//
+
+                launcher.launch1D( num);
+				clFinish(m_queue);
+                
+                //the counter accumulates over both kernels, so this is the total
+                numPairs = pairCount.at(0);
+                if (numPairs>maxPairs)
+					numPairs = maxPairs;
+			}
+			
+#else
+        //debugging aid: replay a launch from arguments previously dumped to m_sapKernelArgs.bin
+        int numPairs = 0;
+        
+        
+        b3LauncherCL launcher(m_queue, m_sapKernel);
+
+        const char* fileName = "m_sapKernelArgs.bin";
+        FILE* f = fopen(fileName,"rb");
+        if (f)
+        {
+            int sizeInBytes=0;
+            if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET)) 
+            {
+                printf("error, cannot get file size\n");
+                exit(0);
+            }
+            
+            unsigned char* buf = (unsigned char*) malloc(sizeInBytes);
+            fread(buf,sizeInBytes,1,f);
+            int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes,m_context);
+            int num = *(int*)&buf[serializedBytes];
+            launcher.launch1D( num);
+            
+            b3OpenCLArray<int> pairCount(m_context, m_queue);
+            int numElements = launcher.m_arrays[2]->size()/sizeof(int);
+            pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(),numElements);
+            numPairs = pairCount.at(0);
+            //printf("overlapping pairs = %d\n",numPairs);
+            b3AlignedObjectArray<b3Int2>		hostOoverlappingPairs;
+            b3OpenCLArray<b3Int2> tmpGpuPairs(m_context,m_queue);
+            tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(),numPairs );
+   
+            tmpGpuPairs.copyToHost(hostOoverlappingPairs);
+            m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
+            //printf("hello %d\n", m_overlappingPairs.size());
+            free(buf);
+            fclose(f);
+            
+        } else {
+            printf("error: cannot find file %s\n",fileName);
+        }
+        
+        clFinish(m_queue);
+
+        
+#endif
+
+			
+        //shrink from the maxPairs capacity to the number of pairs actually found
+        m_overlappingPairs.resize(numPairs);
+		
+	}//B3_PROFILE("GPU_RADIX SORT");
+
+	
+}
+
+/// Uploads the three host-side AABB lists (all, small, large) into their GPU arrays.
+void b3GpuSapBroadphase::writeAabbsToGpu()
+{
+	//might not be necessary, the 'setupGpuAabbsFull' already takes care of this
+	m_allAabbsGPU.copyFromHost(m_allAabbsCPU);
+
+	//the small and large lists are kept as separate device arrays
+	m_smallAabbsGPU.copyFromHost(m_smallAabbsCPU);
+	m_largeAabbsGPU.copyFromHost(m_largeAabbsCPU);
+}
+
+/// Appends a new AABB to both the large list and the list of all AABBs (host side only).
+/// @param aabbMin,aabbMax        bounds; all four components are copied
+/// @param userPtr                stored in m_minIndices[3], overwriting the fourth min component
+/// @param collisionFilterGroup   not used by this function
+/// @param collisionFilterMask    not used by this function
+void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
+{
+	b3SapAabb proxyAabb;
+	for (int component=0;component<4;component++)
+	{
+		proxyAabb.m_min[component] = aabbMin[component];
+		proxyAabb.m_max[component] = aabbMax[component];
+	}
+
+	//the fourth slots carry indices instead of coordinates
+	proxyAabb.m_minIndices[3] = userPtr;
+	proxyAabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();//position this aabb gets in m_allAabbsCPU
+
+	m_largeAabbsCPU.push_back(proxyAabb);
+	m_allAabbsCPU.push_back(proxyAabb);
+}
+
+/// Appends a new AABB to both the small list and the list of all AABBs (host side only).
+/// @param aabbMin,aabbMax        bounds; all four components are copied
+/// @param userPtr                stored in m_minIndices[3], overwriting the fourth min component
+/// @param collisionFilterGroup   not used by this function
+/// @param collisionFilterMask    not used by this function
+void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask)
+{
+	b3SapAabb proxyAabb;
+	for (int component=0;component<4;component++)
+	{
+		proxyAabb.m_min[component] = aabbMin[component];
+		proxyAabb.m_max[component] = aabbMax[component];
+	}
+
+	//the fourth slots carry indices instead of coordinates
+	proxyAabb.m_minIndices[3] = userPtr;
+	proxyAabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();//position this aabb gets in m_allAabbsCPU
+
+	m_smallAabbsCPU.push_back(proxyAabb);
+	m_allAabbsCPU.push_back(proxyAabb);
+}
+
+/// Returns the OpenCL buffer behind m_allAabbsGPU.
+cl_mem	b3GpuSapBroadphase::getAabbBufferWS()
+{
+	cl_mem allAabbsBuffer = m_allAabbsGPU.getBufferCL();
+	return allAabbsBuffer;
+}
+
+/// Returns the current number of elements in m_overlappingPairs.
+int	b3GpuSapBroadphase::getNumOverlap()
+{
+	int numOverlappingPairs = m_overlappingPairs.size();
+	return numOverlappingPairs;
+}
+/// Returns the OpenCL buffer behind m_overlappingPairs.
+cl_mem	b3GpuSapBroadphase::getOverlappingPairBuffer()
+{
+	cl_mem pairBuffer = m_overlappingPairs.getBufferCL();
+	return pairBuffer;
+}
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
@@ -0,0 +1,69 @@
+#ifndef B3_GPU_SAP_BROADPHASE_H
+#define B3_GPU_SAP_BROADPHASE_H
+
+#include "Bullet3OpenCL/ParallelPrimitives/b3OpenCLArray.h"
+#include "Bullet3OpenCL/ParallelPrimitives/b3FillCL.h" //b3Int2
+class b3Vector3;
+#include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
+
+#include "b3SapAabb.h"
+
+
+
+/// Sweep-and-prune style broadphase that finds overlapping AABB pairs, either on the
+/// OpenCL device (calculateOverlappingPairs) or on the host (calculateOverlappingPairsHost).
+/// AABBs are registered through createProxy/createLargeProxy and uploaded with writeAabbsToGpu.
+class b3GpuSapBroadphase
+{
+	
+	//OpenCL handles passed to the constructor
+	cl_context				m_context;
+	cl_device_id			m_device;
+	cl_command_queue		m_queue;
+	//kernels created in the constructor and released in the destructor
+	cl_kernel				m_flipFloatKernel;
+	cl_kernel				m_scatterKernel ;
+	cl_kernel				m_copyAabbsKernel;
+	cl_kernel				m_sapKernel;
+	cl_kernel				m_sap2Kernel;
+
+	//radix sorter, allocated in the constructor and deleted in the destructor
+	class b3RadixSort32CL* m_sorter;
+
+	///test for 3d SAP
+	//indexed as [axis][buffer]
+	b3AlignedObjectArray<b3SortData>		m_sortedAxisCPU[3][2];
+	//-1 until init3dSap has run, afterwards the index (0 or 1) of the buffer in use
+	int	m_currentBuffer;
+
+	public:
+	
+	//every aabb, in creation order; device and host copy
+	b3OpenCLArray<b3SapAabb>	m_allAabbsGPU;
+	b3AlignedObjectArray<b3SapAabb>	m_allAabbsCPU;
+
+	//aabbs added through createProxy
+	b3OpenCLArray<b3SapAabb>	m_smallAabbsGPU;
+	b3AlignedObjectArray<b3SapAabb>	m_smallAabbsCPU;
+
+	//aabbs added through createLargeProxy
+	b3OpenCLArray<b3SapAabb>	m_largeAabbsGPU;
+	b3AlignedObjectArray<b3SapAabb>	m_largeAabbsCPU;
+
+	//result of the last calculateOverlappingPairs/calculateOverlappingPairsHost call
+	b3OpenCLArray<b3Int2>		m_overlappingPairs;
+
+	//temporary gpu work memory
+	b3OpenCLArray<b3SortData>	m_gpuSmallSortData;
+	b3OpenCLArray<b3SapAabb>	m_gpuSmallSortedAabbs;
+
+
+	b3GpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue  q );
+	virtual ~b3GpuSapBroadphase();
+	
+	//pair search on the device / on the host; both write m_overlappingPairs
+	void  calculateOverlappingPairs();
+	void  calculateOverlappingPairsHost();
+
+	//experimental 3-axis SAP (host side)
+	void init3dSap();
+	void calculateOverlappingPairsHostIncremental3Sap();
+
+	//userPtr is stored with the aabb and reported in the overlapping pairs;
+	//collisionFilterGroup/collisionFilterMask are currently not used by the implementation
+	void createProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+	void createLargeProxy(const b3Vector3& aabbMin,  const b3Vector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
+
+	//call writeAabbsToGpu after done making all changes (createProxy etc)
+	void writeAabbsToGpu();
+
+	//accessors for the device buffers and the pair count
+	cl_mem	getAabbBufferWS();
+	int	getNumOverlap();
+	cl_mem	getOverlappingPairBuffer();
+};
+
+#endif //B3_GPU_SAP_BROADPHASE_H
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
@@ -0,0 +1,18 @@
+#ifndef B3_SAP_AABB_H
+#define B3_SAP_AABB_H
+
+/// Axis aligned bounding box as used by the SAP broadphase: two blocks of four 32-bit values.
+/// Each block can be read as floats (coordinates) or as ints (indices) through the union.
+struct b3SapAabb
+{
+	union
+	{
+		float m_min[4];
+		//int view of the same storage; the broadphase keeps the user index in slot [3]
+		int m_minIndices[4];
+	};
+	union
+	{
+		float m_max[4];
+		//int view of the same storage; the broadphase keeps the index into its all-aabbs array in slot [3]
+		int m_signedMaxIndices[4];
+	};
+};
+
+#endif //B3_SAP_AABB_H
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
@@ -0,0 +1,320 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+//Device-side aabb: min and max are each one float4 that can also be accessed
+//per element as floats (m_minElems/m_maxElems) or as ints (m_minIndices/m_maxIndices).
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		//m_minIndices[3] is what the pair kernels write into pairsOut
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		//m_maxIndices[3] is used by copyAabbsKernel as index into the source array
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+/// conservative test for overlap between two aabbs
+/// (first aabb without address space qualifier, second aabb in __local memory)
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+{
+	//an axis separates the boxes when one starts after the other ends
+	bool separatedX = (aabb1->m_min.x > aabb2->m_max.x) || (aabb1->m_max.x < aabb2->m_min.x);
+	bool separatedZ = (aabb1->m_min.z > aabb2->m_max.z) || (aabb1->m_max.z < aabb2->m_min.z);
+	bool separatedY = (aabb1->m_min.y > aabb2->m_max.y) || (aabb1->m_max.y < aabb2->m_min.y);
+	return !(separatedX || separatedZ || separatedY);
+}
+/// conservative overlap test, both aabbs in __global memory
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+{
+	//an axis separates the boxes when one starts after the other ends
+	bool separatedX = (aabb1->m_min.x > aabb2->m_max.x) || (aabb1->m_max.x < aabb2->m_min.x);
+	bool separatedZ = (aabb1->m_min.z > aabb2->m_max.z) || (aabb1->m_max.z < aabb2->m_min.z);
+	bool separatedY = (aabb1->m_min.y > aabb2->m_max.y) || (aabb1->m_max.y < aabb2->m_min.y);
+	return !(separatedX || separatedZ || separatedY);
+}
+
+/// conservative overlap test, first aabb without address space qualifier, second aabb in __global memory
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+{
+	//an axis separates the boxes when one starts after the other ends
+	bool separatedX = (aabb1->m_min.x > aabb2->m_max.x) || (aabb1->m_max.x < aabb2->m_min.x);
+	bool separatedZ = (aabb1->m_min.z > aabb2->m_max.z) || (aabb1->m_max.z < aabb2->m_min.z);
+	bool separatedY = (aabb1->m_min.y > aabb2->m_max.y) || (aabb1->m_max.y < aabb2->m_min.y);
+	return !(separatedX || separatedZ || separatedY);
+}
+
+
+//Tests every aabb of 'unsortedAabbs' (global id 0) against every aabb of 'sortedAabbs' (global id 1).
+//Each overlap claims a slot through an atomic increment of pairCount; the pair is only stored
+//when the slot is below maxPairs, so pairCount can end up larger than maxPairs. 'axis' is not used.
+__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
+{
+	int unsortedIndex = get_global_id(0);
+	int sortedIndex = get_global_id(1);
+
+	//work-items outside either array have nothing to do
+	if (unsortedIndex>=numUnsortedAabbs || sortedIndex>=numSortedAabbs)
+		return;
+
+	if (!TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[unsortedIndex],&sortedAabbs[sortedIndex]))
+		return;
+
+	int2 myPair;
+	myPair.x = unsortedAabbs[unsortedIndex].m_minIndices[3];
+	myPair.y = sortedAabbs[sortedIndex].m_minIndices[3];
+
+	int slot = atomic_inc (pairCount);
+	if (slot<maxPairs)
+	{
+		pairsOut[slot] = myPair; //flush to main memory
+	}
+}
+
+//One work-item per aabb i: walks the following aabbs j>i and stops at the first j whose minimum on
+//'axis' lies beyond the maximum of aabb i. Every overlap found on the way claims a slot through
+//pairCount and is stored when the slot is below maxPairs.
+__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+
+	for (int other=i+1;other<numObjects;other++)
+	{
+		//no later aabb can overlap on this axis once this test succeeds
+		if (aabbs[i].m_maxElems[axis] < (aabbs[other].m_minElems[axis]))
+			break;
+
+		if (!TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[other]))
+			continue;
+
+		int2 myPair;
+		myPair.x = aabbs[i].m_minIndices[3];
+		myPair.y = aabbs[other].m_minIndices[3];
+
+		int slot = atomic_inc (pairCount);
+		if (slot<maxPairs)
+		{
+			pairsOut[slot] = myPair; //flush to main memory
+		}
+	}
+}
+
+
+
+
+//Same search as computePairsKernelOriginal (aabb i against the following aabbs j>i), but all
+//work-items of a work-group step through j together: a work-item that is finished keeps looping,
+//and hitting the barriers, until every work-item of the group has requested a break.
+//There is no early return for i>=numObjects; such work-items only take part in the barriers.
+__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	//per work-group counters: number of work-items, and number of those that are finished
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	//every work-item registers itself once
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	//set once this work-item has requested a break; it never requests twice
+	int localBreak = 0;
+
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+			//aabb j starts beyond the end of aabb i on 'axis': this work-item is finished
+	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		//ran past the end of the array: finished as well
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		//here !localBreak implies j<numObjects, so both indices are inside the array
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+			{
+				int2 myPair;
+				myPair.x = aabbs[i].m_minIndices[3];
+				myPair.y = aabbs[j].m_minIndices[3];
+				//claim an output slot; pairs beyond maxPairs are counted but not stored
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		j++;
+
+	//loop until all work-items of the group have requested a break
+	} while (breakRequest[0]<numActiveWgItems[0]);
+}
+
+
+//Variant of computePairsKernelBarrier that reads the candidate aabbs from __local memory.
+//The work-group keeps a 128 entry window of aabbs in localAabbs; each work-item loads the entries
+//localId and localId+64, and the window is moved forward by 64 aabbs after every 64 iterations.
+//Work-items (and window entries) beyond numObjects use aabbs[0] as a placeholder.
+//NOTE(review): the localId/localId+64 indexing assumes a work-group size of 64 - confirm against the launch configuration
+__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	//per work-group counters: number of work-items, and number of those that are finished
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	
+	//private copy of this work-item's own aabb
+	btAabbCL myAabb;
+	
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	//localCount: offset inside the current window, block: offset of the window in 'aabbs'
+	int localCount=0;
+	int block=0;
+	//initial window fill
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	//every work-item registers itself once
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	//set once this work-item has requested a break; it never requests twice
+	int localBreak = 0;
+	
+	//j is only used for the bounds checks; the candidate itself is localAabbs[localCount+localId+1]
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+			//candidate starts beyond the end of this aabb on 'axis': this work-item is finished
+	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		//ran past the end of the array: finished as well
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int2 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				//claim an output slot; pairs beyond maxPairs are counted but not stored
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		
+		//all reads of the current window are done before it may be overwritten below
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		localCount++;
+		if (localCount==64)
+		{
+			//move the window forward by 64 aabbs and reload both halves
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+		
+	//loop until all work-items of the group have requested a break
+	} while (breakRequest[0]<numActiveWgItems[0]);
+	
+}
+
+
+
+
+//http://stereopsis.com/radix.html
+//Converts a float into an unsigned key whose unsigned ordering matches the float ordering:
+//negative values get all bits inverted, non-negative values only get the sign bit set.
+unsigned int FloatFlip(float fl);
+unsigned int FloatFlip(float fl)
+{
+	//as_uint reinterprets the bits without going through a pointer cast
+	unsigned int f = as_uint(fl);
+	unsigned int mask = -(int)(f >> 31) | 0x80000000;
+	return f ^ mask;
+}
+//Inverse of FloatFlip: turns a sortable unsigned key back into the float it was made from.
+float IFloatFlip(unsigned int f);
+float IFloatFlip(unsigned int f)
+{
+	//keys with the top bit set came from non-negative floats (only the sign bit was flipped),
+	//all other keys came from negative floats (every bit was flipped)
+	unsigned int mask = ((f >> 31) - 1) | 0x80000000;
+	//as_float reinterprets the bits without going through a pointer cast
+	return as_float(f ^ mask);
+}
+
+
+
+
+//Refreshes destAabbs[i] from allAabbs: the source index is read from destAabbs[i].m_maxIndices[3]
+//and written back into the same slot after the copy, so the link to allAabbs is kept.
+__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
+{
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		int srcIndex = destAabbs[i].m_maxIndices[3];
+
+		btAabbCL refreshed = allAabbs[srcIndex];
+		refreshed.m_maxIndices[3] = srcIndex;
+
+		destAabbs[i] = refreshed;
+	}
+}
+
+
+//Writes one sort entry per aabb: x is the sortable key of the aabb minimum on 'axis', y the aabb index.
+__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
+{
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);
+		sortData[i].y = i;
+	}
+}
+
+
+//Writes the aabbs in the order given by sortData: output slot i receives the aabb whose index is sortData[i].y.
+__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
+{
+	int i = get_global_id(0);
+	if (i<numObjects)
+	{
+		int srcIndex = sortData[i].y;
+		sortedAabbs[i] = aabbs[srcIndex];
+	}
+}
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
@@ -0,0 +1,161 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+/// Axis aligned bounding box as used by the kernels in this file.
+/// Both corners are unions, so the same storage can be read as a float4, as four floats
+/// indexed by axis (m_minElems/m_maxElems) or as four ints (m_minIndices/m_maxIndices).
+typedef struct 
+{
+	//minimum corner; computePairsKernel copies m_minIndices[3] into the pairs it writes
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	//maximum corner; TestAabbAgainstAabb2 rejects a pair when m_maxIndices[3] is 0 for both boxes
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+/// conservative test for overlap between two aabbs
+/// @param aabb1 first box, passed through an unqualified pointer
+/// @param aabb2 second box, located in __local memory
+/// @return false when m_maxIndices[3] is 0 for both boxes or when the boxes are apart along x, y or z, true otherwise
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+{
+	//skip pairs between static (mass=0) objects
+	const bool bothStatic = (aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0);
+	if (bothStatic)
+	{
+		return false;
+	}
+
+	//the boxes cannot overlap as soon as their extents are apart along a single axis
+	const bool apartX = (aabb1->m_min.x > aabb2->m_max.x) || (aabb1->m_max.x < aabb2->m_min.x);
+	const bool apartZ = (aabb1->m_min.z > aabb2->m_max.z) || (aabb1->m_max.z < aabb2->m_min.z);
+	const bool apartY = (aabb1->m_min.y > aabb2->m_max.y) || (aabb1->m_max.y < aabb2->m_min.y);
+	return !(apartX || apartZ || apartY);
+}
+
+
+//computePairsKernelBatchWrite
+/// Finds overlapping pairs of boxes and writes them to pairsOut in batches.
+/// Work-item i tests aabbs[i] against the boxes that follow it in the array. It stops as soon as a following box
+/// starts beyond its own maximum on the given axis, or when the end of the array is reached. The work-group keeps
+/// looping until all of its work-items have stopped, because every work-item has to reach the barriers.
+/// Pairs are collected in a private array of 64 entries and flushed with one atomic_add on pairCount,
+/// when the array is full and once more after the loop.
+/// NOTE(review): the early-out presumably relies on aabbs being sorted by m_minElems[axis] - confirm against the host code.
+/// NOTE(review): the local cache of 128 boxes and the offsets of 64 look like they assume a work-group size of 64 - confirm.
+/// @param aabbs      boxes to test; m_minIndices[3] of both boxes forms the pair
+/// @param pairsOut   receives the pairs; only slots below maxPairs are written
+/// @param pairCount  counter advanced by every pair found, including pairs that did not fit into pairsOut
+/// @param numObjects number of boxes in aabbs
+/// @param axis       index into m_minElems/m_maxElems used for the early-out
+/// @param maxPairs   number of slots available in pairsOut
+__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	
+	//private staging area for the pairs found by this work-item
+	int2 myPairs[64];
+	
+	btAabbCL myAabb;
+	
+	//work-items past the end of the array use box 0 as a placeholder, they still take part in the barriers
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	int localCount=0;
+	int block=0;
+	//every work-item loads two boxes, 64 slots apart, into the local cache
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	int curNumPairs = 0;
+	
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+			//the candidate starts beyond the end of my box on the chosen axis: stop testing
+			if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		//ran past the last box: stop testing as well
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int2 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				myPairs[curNumPairs] = myPair;
+				curNumPairs++;
+				if (curNumPairs==64)
+				{
+					int curPair = atomic_add(pairCount,curNumPairs);
+					//avoid a buffer overrun: write every slot that lies below maxPairs, like the
+					//per pair check (curPair<maxPairs) of the kernels that do not batch their writes.
+					//Dropping the whole batch would leave slots below maxPairs unwritten.
+					for (int p=0;p<curNumPairs && (curPair+p)<maxPairs;p++)
+					{
+						pairsOut[curPair+p] = myPairs[p]; //flush to main memory
+					}
+					curNumPairs = 0;
+				}
+			}
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		localCount++;
+		if (localCount==64)
+		{
+			//the cached window is used up, load the next 64 boxes
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+		
+	} while (breakRequest[0]<numActiveWgItems[0]);
+	
+	
+	//flush what is left in the private array
+	if (curNumPairs>0)
+	{
+		int curPair = atomic_add(pairCount,curNumPairs);
+		//avoid a buffer overrun: write every slot that lies below maxPairs
+		for (int p=0;p<curNumPairs && (curPair+p)<maxPairs;p++)
+		{
+			pairsOut[curPair+p] = myPairs[p]; //flush to main memory
+		}
+		curNumPairs = 0;
+	}
+}
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
@@ -0,0 +1,164 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+//sapFastCL holds OpenCL C source as one string literal, every source line is a quoted piece ending in a newline escape.
+//The string defines the b3AabbCL struct, the TestAabbAgainstAabb2 helper and the computePairsKernel kernel.
+//The batch flush in computePairsKernel writes every slot below maxPairs instead of dropping a whole batch
+//that does not fit completely. NOTE(review): presumably generated from kernels/sapFast.cl - that file has to
+//carry the same flush logic, otherwise regenerating this header brings the old check back.
+static const char* sapFastCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Erwin Coumans\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} b3AabbCL;\n"
+"\n"
+"\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
+"{\n"
+"//skip pairs between static (mass=0) objects\n"
+"	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
+"		return false;\n"
+"		\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"\n"
+"//computePairsKernelBatchWrite\n"
+"__kernel void   computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local b3AabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	int2 myPairs[64];\n"
+"	\n"
+"	b3AabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	int curNumPairs = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				myPairs[curNumPairs] = myPair;\n"
+"				curNumPairs++;\n"
+"				if (curNumPairs==64)\n"
+"				{\n"
+"					int curPair = atomic_add(pairCount,curNumPairs);\n"
+"					//avoid a buffer overrun: only write the slots below maxPairs\n"
+"					for (int p=0;p<curNumPairs && (curPair+p)<maxPairs;p++)\n"
+"					{\n"
+"						pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
+"					}\n"
+"					curNumPairs = 0;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"	\n"
+"	if (curNumPairs>0)\n"
+"	{\n"
+"		int curPair = atomic_add(pairCount,curNumPairs);\n"
+"		//avoid a buffer overrun: only write the slots below maxPairs\n"
+"		for (int p=0;p<curNumPairs && (curPair+p)<maxPairs;p++)\n"
+"		{\n"
+"			pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
+"		}\n"
+"		curNumPairs = 0;\n"
+"	}\n"
+"}\n"
+;
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@@ -0,0 +1,324 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+//sapCL holds OpenCL C source as one string literal, every source line is a quoted piece ending in a newline escape.
+//The string defines the b3AabbCL struct, the overlap helpers TestAabbAgainstAabb2, TestAabbAgainstAabb2GlobalGlobal
+//and TestAabbAgainstAabb2Global, the pair finding kernels computePairsKernelTwoArrays, computePairsKernelOriginal,
+//computePairsKernelBarrier and computePairsKernelLocalSharedMemory, the FloatFlip/IFloatFlip helpers and the
+//kernels copyAabbsKernel, flipFloatKernel and scatterKernel.
+//NOTE(review): presumably generated from kernels/sap.cl, so changes belong in that file followed by a regeneration - confirm the stringify workflow.
+static const char* sapCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Erwin Coumans\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} b3AabbCL;\n"
+"\n"
+"\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numUnsortedAabbs)\n"
+"		return;\n"
+"\n"
+"	int j = get_global_id(1);\n"
+"	if (j>=numSortedAabbs)\n"
+"		return;\n"
+"\n"
+"	if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
+"	{\n"
+"		int2 myPair;\n"
+"		\n"
+"		myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
+"		myPair.y = sortedAabbs[j].m_minIndices[3];\n"
+"\n"
+"		int curPair = atomic_inc (pairCount);\n"
+"		if (curPair<maxPairs)\n"
+"		{\n"
+"				pairsOut[curPair] = myPair; //flush to main memory\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel void   computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	for (int j=i+1;j<numObjects;j++)\n"
+"	{\n"
+"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"		{\n"
+"			break;\n"
+"		}\n"
+"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"		{\n"
+"			int2 myPair;\n"
+"			myPair.x = aabbs[i].m_minIndices[3];\n"
+"			myPair.y = aabbs[j].m_minIndices[3];\n"
+"			int curPair = atomic_inc (pairCount);\n"
+"			if (curPair<maxPairs)\n"
+"			{\n"
+"					pairsOut[curPair] = myPair; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = aabbs[i].m_minIndices[3];\n"
+"				myPair.y = aabbs[j].m_minIndices[3];\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		j++;\n"
+"\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local b3AabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	b3AabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"//http://stereopsis.com/radix.html\n"
+"unsigned int FloatFlip(float fl);\n"
+"unsigned int FloatFlip(float fl)\n"
+"{\n"
+"	unsigned int f = *(unsigned int*)&fl;\n"
+"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
+"	return f ^ mask;\n"
+"}\n"
+"float IFloatFlip(unsigned int f);\n"
+"float IFloatFlip(unsigned int f)\n"
+"{\n"
+"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
+"	unsigned int fl = f ^ mask;\n"
+"	return *(float*)&fl;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	int src = destAabbs[i].m_maxIndices[3];\n"
+"	destAabbs[i] = allAabbs[src];\n"
+"	destAabbs[i].m_maxIndices[3] = src;\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"		\n"
+"		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
+"		sortData[i].y = i;\n"
+"		\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"\n"
+"		sortedAabbs[i] = aabbs[sortData[i].y];\n"
+"}\n"
+"\n"
+;