reorder files, in preparation for Bullet 3 -> Bullet 2 merge

2013-04-29 19:04:08 -07:00
parent 55b69201a9
commit 3ac332f3a7
162 changed files with 215 additions and 3070 deletions
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl
@@ -0,0 +1,320 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+/// conservative test for overlap between two aabbs
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2GlobalGlobal(__global const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2Global(const btAabbCL* aabb1, __global const btAabbCL* aabb2)
+{
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+
+
+__kernel void   computePairsKernelTwoArrays( __global const btAabbCL* unsortedAabbs, __global const btAabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	if (i>=numUnsortedAabbs)
+		return;
+
+	int j = get_global_id(1);
+	if (j>=numSortedAabbs)
+		return;
+
+	if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))
+	{
+		int2 myPair;
+		
+		myPair.x = unsortedAabbs[i].m_minIndices[3];
+		myPair.y = sortedAabbs[j].m_minIndices[3];
+
+		int curPair = atomic_inc (pairCount);
+		if (curPair<maxPairs)
+		{
+				pairsOut[curPair] = myPair; //flush to main memory
+		}
+	}
+}
+
+__kernel void   computePairsKernelOriginal( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+	for (int j=i+1;j<numObjects;j++)
+	{
+  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) 
+		{
+			break;
+		}
+		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+		{
+			int2 myPair;
+			myPair.x = aabbs[i].m_minIndices[3];
+			myPair.y = aabbs[j].m_minIndices[3];
+			int curPair = atomic_inc (pairCount);
+			if (curPair<maxPairs)
+			{
+					pairsOut[curPair] = myPair; //flush to main memory
+			}
+		}
+	}
+}
+
+
+
+
+__kernel void   computePairsKernelBarrier( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))
+			{
+				int2 myPair;
+				myPair.x = aabbs[i].m_minIndices[3];
+				myPair.y = aabbs[j].m_minIndices[3];
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		j++;
+
+	} while (breakRequest[0]<numActiveWgItems[0]);
+}
+
+
+__kernel void   computePairsKernelLocalSharedMemory( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	
+	btAabbCL myAabb;
+	
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	int localCount=0;
+	int block=0;
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int2 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				int curPair = atomic_inc (pairCount);
+				if (curPair<maxPairs)
+				{
+						pairsOut[curPair] = myPair; //flush to main memory
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		localCount++;
+		if (localCount==64)
+		{
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+		
+	} while (breakRequest[0]<numActiveWgItems[0]);
+	
+}
+
+
+
+
+//http://stereopsis.com/radix.html
+unsigned int FloatFlip(float fl);
+unsigned int FloatFlip(float fl)
+{
+	unsigned int f = *(unsigned int*)&fl;
+	unsigned int mask = -(int)(f >> 31) | 0x80000000;
+	return f ^ mask;
+}
+float IFloatFlip(unsigned int f);
+float IFloatFlip(unsigned int f)
+{
+	unsigned int mask = ((f >> 31) - 1) | 0x80000000;
+	unsigned int fl = f ^ mask;
+	return *(float*)&fl;
+}
+
+
+
+
+__kernel void   copyAabbsKernel( __global const btAabbCL* allAabbs, __global btAabbCL* destAabbs, int numObjects)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+	int src = destAabbs[i].m_maxIndices[3];
+	destAabbs[i] = allAabbs[src];
+	destAabbs[i].m_maxIndices[3] = src;
+}
+
+
+__kernel void   flipFloatKernel( __global const btAabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+		
+		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);
+		sortData[i].y = i;
+		
+}
+
+
+__kernel void   scatterKernel( __global const btAabbCL* aabbs, volatile __global const int2* sortData, __global btAabbCL* sortedAabbs, int numObjects)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+
+		sortedAabbs[i] = aabbs[sortData[i].y];
+}
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
@@ -0,0 +1,161 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Erwin Coumans
+
+
+typedef struct 
+{
+	union
+	{
+		float4	m_min;
+		float   m_minElems[4];
+		int			m_minIndices[4];
+	};
+	union
+	{
+		float4	m_max;
+		float   m_maxElems[4];
+		int			m_maxIndices[4];
+	};
+} btAabbCL;
+
+
+/// conservative test for overlap between two aabbs
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
+bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
+{
+//skip pairs between static (mass=0) objects
+	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))
+		return false;
+		
+	bool overlap = true;
+	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;
+	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;
+	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;
+	return overlap;
+}
+
+
+//computePairsKernelBatchWrite
+__kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
+{
+	int i = get_global_id(0);
+	int localId = get_local_id(0);
+
+	__local int numActiveWgItems[1];
+	__local int breakRequest[1];
+	__local btAabbCL localAabbs[128];// = aabbs[i];
+	
+	int2 myPairs[64];
+	
+	btAabbCL myAabb;
+	
+	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];
+	float testValue = 	myAabb.m_maxElems[axis];
+	
+	if (localId==0)
+	{
+		numActiveWgItems[0] = 0;
+		breakRequest[0] = 0;
+	}
+	int localCount=0;
+	int block=0;
+	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];
+	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];
+	
+	barrier(CLK_LOCAL_MEM_FENCE);
+	atomic_inc(numActiveWgItems);
+	barrier(CLK_LOCAL_MEM_FENCE);
+	int localBreak = 0;
+	int curNumPairs = 0;
+	
+	int j=i+1;
+	do
+	{
+		barrier(CLK_LOCAL_MEM_FENCE);
+	
+		if (j<numObjects)
+		{
+	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) 
+			{
+				if (!localBreak)
+				{
+					atomic_inc(breakRequest);
+					localBreak = 1;
+				}
+			}
+		}
+		
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (j>=numObjects && !localBreak)
+		{
+			atomic_inc(breakRequest);
+			localBreak = 1;
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		if (!localBreak)
+		{
+			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))
+			{
+				int2 myPair;
+				myPair.x = myAabb.m_minIndices[3];
+				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];
+				myPairs[curNumPairs] = myPair;
+				curNumPairs++;
+				if (curNumPairs==64)
+				{
+					int curPair = atomic_add(pairCount,curNumPairs);
+					//avoid a buffer overrun
+					if ((curPair+curNumPairs)<maxPairs)
+					{
+						for (int p=0;p<curNumPairs;p++)
+						{
+							pairsOut[curPair+p] = myPairs[p]; //flush to main memory
+						}
+					}
+					curNumPairs = 0;
+				}
+			}
+		}
+		barrier(CLK_LOCAL_MEM_FENCE);
+		
+		localCount++;
+		if (localCount==64)
+		{
+			localCount = 0;
+			block+=64;			
+			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];
+			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];
+		}
+		j++;
+		
+	} while (breakRequest[0]<numActiveWgItems[0]);
+	
+	
+	if (curNumPairs>0)
+	{
+		//avoid a buffer overrun
+		int curPair = atomic_add(pairCount,curNumPairs);
+		if ((curPair+curNumPairs)<maxPairs)
+		{
+			for (int p=0;p<curNumPairs;p++)
+			{
+					pairsOut[curPair+p] = myPairs[p]; //flush to main memory
+			}
+		}
+		curNumPairs = 0;
+	}
+}
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFastKernels.h
@@ -0,0 +1,164 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* sapFastCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Erwin Coumans\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} b3AabbCL;\n"
+"\n"
+"\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
+"{\n"
+"//skip pairs between static (mass=0) objects\n"
+"	if ((aabb1->m_maxIndices[3]==0) && (aabb2->m_maxIndices[3] == 0))\n"
+"		return false;\n"
+"		\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"\n"
+"//computePairsKernelBatchWrite\n"
+"__kernel void   computePairsKernel( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local b3AabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	int2 myPairs[64];\n"
+"	\n"
+"	b3AabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	int curNumPairs = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				myPairs[curNumPairs] = myPair;\n"
+"				curNumPairs++;\n"
+"				if (curNumPairs==64)\n"
+"				{\n"
+"					int curPair = atomic_add(pairCount,curNumPairs);\n"
+"					//avoid a buffer overrun\n"
+"					if ((curPair+curNumPairs)<maxPairs)\n"
+"					{\n"
+"						for (int p=0;p<curNumPairs;p++)\n"
+"						{\n"
+"							pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
+"						}\n"
+"					}\n"
+"					curNumPairs = 0;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"	\n"
+"	if (curNumPairs>0)\n"
+"	{\n"
+"		//avoid a buffer overrun\n"
+"		int curPair = atomic_add(pairCount,curNumPairs);\n"
+"		if ((curPair+curNumPairs)<maxPairs)\n"
+"		{\n"
+"			for (int p=0;p<curNumPairs;p++)\n"
+"			{\n"
+"					pairsOut[curPair+p] = myPairs[p]; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"		curNumPairs = 0;\n"
+"	}\n"
+"}\n"
+;
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapKernels.h
@@ -0,0 +1,324 @@
+//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
+static const char* sapCL= \
+"/*\n"
+"Copyright (c) 2012 Advanced Micro Devices, Inc.  \n"
+"\n"
+"This software is provided 'as-is', without any express or implied warranty.\n"
+"In no event will the authors be held liable for any damages arising from the use of this software.\n"
+"Permission is granted to anyone to use this software for any purpose, \n"
+"including commercial applications, and to alter it and redistribute it freely, \n"
+"subject to the following restrictions:\n"
+"\n"
+"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
+"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
+"3. This notice may not be removed or altered from any source distribution.\n"
+"*/\n"
+"//Originally written by Erwin Coumans\n"
+"\n"
+"\n"
+"typedef struct \n"
+"{\n"
+"	union\n"
+"	{\n"
+"		float4	m_min;\n"
+"		float   m_minElems[4];\n"
+"		int			m_minIndices[4];\n"
+"	};\n"
+"	union\n"
+"	{\n"
+"		float4	m_max;\n"
+"		float   m_maxElems[4];\n"
+"		int			m_maxIndices[4];\n"
+"	};\n"
+"} b3AabbCL;\n"
+"\n"
+"\n"
+"/// conservative test for overlap between two aabbs\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2(const b3AabbCL* aabb1, __local const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2GlobalGlobal(__global const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2);\n"
+"bool TestAabbAgainstAabb2Global(const b3AabbCL* aabb1, __global const b3AabbCL* aabb2)\n"
+"{\n"
+"	bool overlap = true;\n"
+"	overlap = (aabb1->m_min.x > aabb2->m_max.x || aabb1->m_max.x < aabb2->m_min.x) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.z > aabb2->m_max.z || aabb1->m_max.z < aabb2->m_min.z) ? false : overlap;\n"
+"	overlap = (aabb1->m_min.y > aabb2->m_max.y || aabb1->m_max.y < aabb2->m_min.y) ? false : overlap;\n"
+"	return overlap;\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelTwoArrays( __global const b3AabbCL* unsortedAabbs, __global const b3AabbCL* sortedAabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numUnsortedAabbs, int numSortedAabbs, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numUnsortedAabbs)\n"
+"		return;\n"
+"\n"
+"	int j = get_global_id(1);\n"
+"	if (j>=numSortedAabbs)\n"
+"		return;\n"
+"\n"
+"	if (TestAabbAgainstAabb2GlobalGlobal(&unsortedAabbs[i],&sortedAabbs[j]))\n"
+"	{\n"
+"		int2 myPair;\n"
+"		\n"
+"		myPair.x = unsortedAabbs[i].m_minIndices[3];\n"
+"		myPair.y = sortedAabbs[j].m_minIndices[3];\n"
+"\n"
+"		int curPair = atomic_inc (pairCount);\n"
+"		if (curPair<maxPairs)\n"
+"		{\n"
+"				pairsOut[curPair] = myPair; //flush to main memory\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel void   computePairsKernelOriginal( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	for (int j=i+1;j<numObjects;j++)\n"
+"	{\n"
+"  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"		{\n"
+"			break;\n"
+"		}\n"
+"		if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"		{\n"
+"			int2 myPair;\n"
+"			myPair.x = aabbs[i].m_minIndices[3];\n"
+"			myPair.y = aabbs[j].m_minIndices[3];\n"
+"			int curPair = atomic_inc (pairCount);\n"
+"			if (curPair<maxPairs)\n"
+"			{\n"
+"					pairsOut[curPair] = myPair; //flush to main memory\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelBarrier( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(aabbs[i].m_maxElems[axis] < (aabbs[j].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2GlobalGlobal(&aabbs[i],&aabbs[j]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = aabbs[i].m_minIndices[3];\n"
+"				myPair.y = aabbs[j].m_minIndices[3];\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		j++;\n"
+"\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   computePairsKernelLocalSharedMemory( __global const b3AabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	int localId = get_local_id(0);\n"
+"\n"
+"	__local int numActiveWgItems[1];\n"
+"	__local int breakRequest[1];\n"
+"	__local b3AabbCL localAabbs[128];// = aabbs[i];\n"
+"	\n"
+"	b3AabbCL myAabb;\n"
+"	\n"
+"	myAabb = (i<numObjects)? aabbs[i]:aabbs[0];\n"
+"	float testValue = 	myAabb.m_maxElems[axis];\n"
+"	\n"
+"	if (localId==0)\n"
+"	{\n"
+"		numActiveWgItems[0] = 0;\n"
+"		breakRequest[0] = 0;\n"
+"	}\n"
+"	int localCount=0;\n"
+"	int block=0;\n"
+"	localAabbs[localId] = (i+block)<numObjects? aabbs[i+block] : aabbs[0];\n"
+"	localAabbs[localId+64] = (i+block+64)<numObjects? aabbs[i+block+64]: aabbs[0];\n"
+"	\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	atomic_inc(numActiveWgItems);\n"
+"	barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	int localBreak = 0;\n"
+"	\n"
+"	int j=i+1;\n"
+"	do\n"
+"	{\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"	\n"
+"		if (j<numObjects)\n"
+"		{\n"
+"	  	if(testValue < (localAabbs[localCount+localId+1].m_minElems[axis])) \n"
+"			{\n"
+"				if (!localBreak)\n"
+"				{\n"
+"					atomic_inc(breakRequest);\n"
+"					localBreak = 1;\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (j>=numObjects && !localBreak)\n"
+"		{\n"
+"			atomic_inc(breakRequest);\n"
+"			localBreak = 1;\n"
+"		}\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"		\n"
+"		if (!localBreak)\n"
+"		{\n"
+"			if (TestAabbAgainstAabb2(&myAabb,&localAabbs[localCount+localId+1]))\n"
+"			{\n"
+"				int2 myPair;\n"
+"				myPair.x = myAabb.m_minIndices[3];\n"
+"				myPair.y = localAabbs[localCount+localId+1].m_minIndices[3];\n"
+"				int curPair = atomic_inc (pairCount);\n"
+"				if (curPair<maxPairs)\n"
+"				{\n"
+"						pairsOut[curPair] = myPair; //flush to main memory\n"
+"				}\n"
+"			}\n"
+"		}\n"
+"		\n"
+"		barrier(CLK_LOCAL_MEM_FENCE);\n"
+"\n"
+"		localCount++;\n"
+"		if (localCount==64)\n"
+"		{\n"
+"			localCount = 0;\n"
+"			block+=64;			\n"
+"			localAabbs[localId] = ((i+block)<numObjects) ? aabbs[i+block] : aabbs[0];\n"
+"			localAabbs[localId+64] = ((i+64+block)<numObjects) ? aabbs[i+block+64] : aabbs[0];\n"
+"		}\n"
+"		j++;\n"
+"		\n"
+"	} while (breakRequest[0]<numActiveWgItems[0]);\n"
+"	\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"//http://stereopsis.com/radix.html\n"
+"unsigned int FloatFlip(float fl);\n"
+"unsigned int FloatFlip(float fl)\n"
+"{\n"
+"	unsigned int f = *(unsigned int*)&fl;\n"
+"	unsigned int mask = -(int)(f >> 31) | 0x80000000;\n"
+"	return f ^ mask;\n"
+"}\n"
+"float IFloatFlip(unsigned int f);\n"
+"float IFloatFlip(unsigned int f)\n"
+"{\n"
+"	unsigned int mask = ((f >> 31) - 1) | 0x80000000;\n"
+"	unsigned int fl = f ^ mask;\n"
+"	return *(float*)&fl;\n"
+"}\n"
+"\n"
+"\n"
+"\n"
+"\n"
+"__kernel void   copyAabbsKernel( __global const b3AabbCL* allAabbs, __global b3AabbCL* destAabbs, int numObjects)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"	int src = destAabbs[i].m_maxIndices[3];\n"
+"	destAabbs[i] = allAabbs[src];\n"
+"	destAabbs[i].m_maxIndices[3] = src;\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   flipFloatKernel( __global const b3AabbCL* aabbs, volatile __global int2* sortData, int numObjects, int axis)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"		\n"
+"		sortData[i].x = FloatFlip(aabbs[i].m_minElems[axis]);\n"
+"		sortData[i].y = i;\n"
+"		\n"
+"}\n"
+"\n"
+"\n"
+"__kernel void   scatterKernel( __global const b3AabbCL* aabbs, volatile __global const int2* sortData, __global b3AabbCL* sortedAabbs, int numObjects)\n"
+"{\n"
+"	int i = get_global_id(0);\n"
+"	if (i>=numObjects)\n"
+"		return;\n"
+"\n"
+"		sortedAabbs[i] = aabbs[sortData[i].y];\n"
+"}\n"
+"\n"
+;