From 5991eef7490802b31cb1c3ef2dc2eef621896de8 Mon Sep 17 00:00:00 2001
From: erwin coumans <erwin.coumans@gmail.com>
Date: Sat, 20 Jul 2013 21:16:24 -0700
Subject: [PATCH] add GPU incremental 3d sap (not enabled by default)

---
 .../b3GpuSapBroadphase.cpp                    | 741 +++++++++++++++++-
 .../BroadphaseCollision/b3GpuSapBroadphase.h  |  24 +-
 .../BroadphaseCollision/b3SapAabb.h           |   4 +-
 .../BroadphaseCollision/kernels/sapFast.cl    | 270 +++++++
 .../ParallelPrimitives/b3RadixSort32CL.h      |   9 +-
 5 files changed, 1017 insertions(+), 31 deletions(-)

diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
index 3dc115204..bdf9359c7 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.cpp
@@ -1,4 +1,6 @@
 
+bool searchIncremental3dSapOnGpu = true;
+
 #include "b3GpuSapBroadphase.h"
 #include "Bullet3Common/b3Vector3.h"
 #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
@@ -27,7 +29,23 @@ m_gpuSmallSortedAabbs(ctx,q),
 m_sum(ctx,q),
 m_sum2(ctx,q),
 m_dst(ctx,q),
-m_currentBuffer(-1)
+m_currentBuffer(-1),
+m_objectMinMaxIndexGPUaxis0(ctx,q),
+m_objectMinMaxIndexGPUaxis1(ctx,q),
+m_objectMinMaxIndexGPUaxis2(ctx,q),
+m_objectMinMaxIndexGPUaxis0prev(ctx,q),
+m_objectMinMaxIndexGPUaxis1prev(ctx,q),
+m_objectMinMaxIndexGPUaxis2prev(ctx,q),
+m_sortedAxisGPU0(ctx,q),
+m_sortedAxisGPU1(ctx,q),
+m_sortedAxisGPU2(ctx,q),
+m_sortedAxisGPU0prev(ctx,q),
+m_sortedAxisGPU1prev(ctx,q),
+m_sortedAxisGPU2prev(ctx,q),
+m_addedHostPairsGPU(ctx,q),
+m_removedHostPairsGPU(ctx,q),
+m_addedCountGPU(ctx,q),
+m_removedCountGPU(ctx,q)
 {
 	const char* sapSrc = sapCL;
     const char* sapFastSrc = sapFastCL;
@@ -36,7 +54,8 @@ m_currentBuffer(-1)
 
 	cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapSrc,&errNum,"",B3_BROADPHASE_SAP_PATH);
 	b3Assert(errNum==CL_SUCCESS);
-	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
+	//cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,sapFastSrc,&errNum,"",B3_BROADPHASE_SAPFAST_PATH);
+	cl_program sapFastProg = b3OpenCLUtils::compileCLProgramFromString(m_context,m_device,0,&errNum,"",B3_BROADPHASE_SAPFAST_PATH,true);
 	b3Assert(errNum==CL_SUCCESS);
 #ifndef __APPLE__
 	m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context,m_device,m_queue);
@@ -54,6 +73,9 @@ m_currentBuffer(-1)
 	m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapSrc, "prepareSumVarianceKernel",&errNum,sapProg );
 	b3Assert(errNum==CL_SUCCESS);
 
+	m_computePairsIncremental3dSapKernel= b3OpenCLUtils::compileCLKernelFromString(m_context, m_device,sapFastSrc, "computePairsIncremental3dSapKernel",&errNum,sapFastProg );
+	b3Assert(errNum==CL_SUCCESS);
+		
 	
 #if 0
 
@@ -89,6 +111,7 @@ b3GpuSapBroadphase::~b3GpuSapBroadphase()
 	clReleaseKernel(m_sapKernel);
 	clReleaseKernel(m_sap2Kernel);
 	clReleaseKernel(m_prepareSumVarianceKernel);
+	clReleaseKernel(m_computePairsIncremental3dSapKernel);
 
 }
 
@@ -125,48 +148,693 @@ void  b3GpuSapBroadphase::init3dSap()
 			for (int buf=0;buf<2;buf++)
 			{
 				int totalNumAabbs = m_allAabbsCPU.size();
-				m_sortedAxisCPU[axis][buf].resize(totalNumAabbs);
+				int numEndPoints = 2*totalNumAabbs;
+				m_sortedAxisCPU[axis][buf].resize(numEndPoints);
 
 				if (buf==m_currentBuffer)
 				{
 					for (int i=0;i<totalNumAabbs;i++)
 					{
-						m_sortedAxisCPU[axis][buf][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
-						m_sortedAxisCPU[axis][buf][i].m_value = i;
+						m_sortedAxisCPU[axis][buf][i*2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis])-1;
+						m_sortedAxisCPU[axis][buf][i*2].m_value = i*2;
+						m_sortedAxisCPU[axis][buf][i*2+1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis])+1;
+						m_sortedAxisCPU[axis][buf][i*2+1].m_value = i*2+1;
 					}
 				}
 			}
 		}
+
+		for (int axis=0;axis<3;axis++)
+		{
+			m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
+		}
+
+		for (int axis=0;axis<3;axis++)
+		{
+			int totalNumAabbs = m_allAabbsCPU.size();
+			int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
+			m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints);
+			for (int i=0;i<numEndPoints;i++)
+			{
+				int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
+				int newDest = destIndex/2;
+				if (destIndex&1)
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i;
+				} else
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i;
+				}
+			}
+		}
+
 	}
 }
+
+
+static bool b3PairCmp(const b3Int2& p, const b3Int2& q)
+{
+	return ((p.x<q.x) || ((p.x==q.x) && (p.y<q.y)));
+}
+
+
+static bool operator==(const b3Int2& a,const b3Int2& b)
+{
+	return a.x == b.x && a.y == b.y;
+};
+
+static bool operator<(const b3Int2& a,const b3Int2& b)
+{
+	return a.x < b.x || (a.x == b.x && a.y < b.y);
+};
+
+static bool operator>(const b3Int2& a,const b3Int2& b)
+{
+	return a.x > b.x || (a.x == b.x && a.y > b.y);
+};
+
+b3AlignedObjectArray<b3Int2> addedHostPairs;
+b3AlignedObjectArray<b3Int2> removedHostPairs;
+
+b3AlignedObjectArray<b3SapAabb>	preAabbs;
+
 void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
 {
-	b3Assert(m_currentBuffer>=0);
-	if (m_currentBuffer<0)
-		return;
+	static int framepje = 0;
+	//printf("framepje=%d\n",framepje++);
 	
-	m_allAabbsGPU.copyToHost(m_allAabbsCPU);
 
-	for (int axis=0;axis<3;axis++)
+	B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap");
+
+	addedHostPairs.resize(0);
+	removedHostPairs.resize(0);
+
+	b3Assert(m_currentBuffer>=0);
+	
 	{
-		for (int buf=0;buf<2;buf++)
+		preAabbs.resize(m_allAabbsCPU.size());
+		for (int i=0;i<preAabbs.size();i++)
 		{
-			b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size());
+			preAabbs[i]=m_allAabbsCPU[i];
 		}
 	}
+	
+
+	if (m_currentBuffer<0)
+		return;
+	{
+		B3_PROFILE("m_allAabbsGPU.copyToHost");
+		m_allAabbsGPU.copyToHost(m_allAabbsCPU);
+	}
+
+	b3AlignedObjectArray<b3Int2> allPairs;
+	{
+		B3_PROFILE("m_overlappingPairs.copyToHost");
+		m_overlappingPairs.copyToHost(allPairs);
+	}
+	if (0)
+	{
+	{
+		printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n",
+		m_allAabbsCPU[40].m_min[0],	m_allAabbsCPU[40].m_min[1],m_allAabbsCPU[40].m_min[2],
+		m_allAabbsCPU[40].m_max[0],	m_allAabbsCPU[40].m_max[1],m_allAabbsCPU[40].m_max[2]);
+	}
+
+	{
+		printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n",
+		m_allAabbsCPU[53].m_min[0],	m_allAabbsCPU[53].m_min[1],m_allAabbsCPU[53].m_min[2],
+		m_allAabbsCPU[53].m_max[0],	m_allAabbsCPU[53].m_max[1],m_allAabbsCPU[53].m_max[2]);
+	}
+	
+
+	{
+	b3Int2 newPair;
+	newPair.x = 40;
+	newPair.y = 53;
+		int index = allPairs.findBinarySearch(newPair);
+		printf("hasPair(40,53)=%d out of %d\n",index, allPairs.size());
+
+		{
+			int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max,(const b3Vector3&)m_allAabbsCPU[53].m_min,(const b3Vector3&)m_allAabbsCPU[53].m_max);
+			printf("overlap=%d\n",overlap);
+		}
+
+		if (preAabbs.size())
+		{
+			int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max,(const b3Vector3&)preAabbs[53].m_min,(const b3Vector3&)preAabbs[53].m_max);
+			printf("prevoverlap=%d\n",prevOverlap);
+		} else
+		{
+			printf("unknown prevoverlap\n");
+		}
+
+	}
+	}
+
+
+	if (0)
+	{
+		for (int i=0;i<m_allAabbsCPU.size();i++)
+		{
+			//printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2],			m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]);
+
+
+		}
+
+		for (int axis=0;axis<3;axis++)
+		{
+			for (int buf=0;buf<2;buf++)
+			{
+				b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size()*2);
+			}
+		}
+	}
+
 
 
 	m_currentBuffer = 1-m_currentBuffer;
 	
-	for (int axis=0;axis<3;axis++)
+
+
+	int totalNumAabbs = m_allAabbsCPU.size();
+
 	{
-		int totalNumAabbs = m_allAabbsCPU.size();
+		B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)");
 		for (int i=0;i<totalNumAabbs;i++)
 		{
-			m_sortedAxisCPU[axis][m_currentBuffer][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
-			m_sortedAxisCPU[axis][m_currentBuffer][i].m_value = i;
+		
+
+			unsigned int keyMin[3];
+			unsigned int keyMax[3];
+			for (int axis=0;axis<3;axis++)
+			{
+				float vmin=m_allAabbsCPU[i].m_min[axis];
+				float vmax = m_allAabbsCPU[i].m_max[axis];
+				keyMin[axis] = FloatFlip(vmin);
+				keyMax[axis] = FloatFlip(vmax);
+			
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_key = keyMin[axis]-1;
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2].m_value = i*2;
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_key = keyMax[axis]+1;
+				m_sortedAxisCPU[axis][m_currentBuffer][i*2+1].m_value = i*2+1;
+			}
+			//printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]);
+
 		}
 	}
+	
+	
+
+	{
+		B3_PROFILE("sort m_sortedAxisCPU");
+		for (int axis=0;axis<3;axis++)
+			m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
+	}
+
+	if (0)
+	{
+		for (int axis=0;axis<3;axis++)
+		{
+			//printf("axis %d\n",axis);
+			for (int i=0;i<m_sortedAxisCPU[axis][m_currentBuffer].size();i++)
+			{
+				int key = m_sortedAxisCPU[axis][m_currentBuffer][i].m_key;
+				int value = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
+				//printf("[%d]=%d\n",i,value);
+			}
+
+		}
+	}
+
+
+	{
+		B3_PROFILE("assign m_objectMinMaxIndexCPU");
+		for (int axis=0;axis<3;axis++)
+		{
+			int totalNumAabbs = m_allAabbsCPU.size();
+			int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
+			m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs);
+			for (int i=0;i<numEndPoints;i++)
+			{
+				int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
+				int newDest = destIndex/2;
+				if (destIndex&1)
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y=i;
+				} else
+				{
+					m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x=i;
+				}
+			}
+		}
+	}
+
+
+	if (0)
+	{	
+		printf("==========================\n");
+		for (int axis=0;axis<3;axis++)
+		{
+			unsigned int curMinIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].x;
+			unsigned int curMaxIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].y;
+			unsigned int prevMaxIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].y;
+			unsigned int prevMinIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].x;
+
+			int dmin40 = curMinIndex40 - prevMinIndex40;
+			int dmax40 = curMinIndex40 - prevMinIndex40;
+			printf("axis %d curMinIndex40=%d prevMinIndex40=%d\n",axis,curMinIndex40, prevMinIndex40);
+			printf("axis %d curMaxIndex40=%d prevMaxIndex40=%d\n",axis,curMaxIndex40, prevMaxIndex40);
+		}
+		printf(".........................\n");
+		for (int axis=0;axis<3;axis++)
+		{
+			unsigned int curMinIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].x;
+			unsigned int curMaxIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].y;
+			unsigned int prevMaxIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].y;
+			unsigned int prevMinIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].x;
+
+			int dmin40 = curMinIndex53 - prevMinIndex53;
+			int dmax40 = curMinIndex53 - prevMinIndex53;
+			printf("axis %d curMinIndex53=%d prevMinIndex53=%d\n",axis,curMinIndex53, prevMinIndex53);
+			printf("axis %d curMaxIndex53=%d prevMaxIndex53=%d\n",axis,curMaxIndex53, prevMaxIndex53);
+		}
+
+	}
+
+
+	int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
+	int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size();
+	int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
+	b3Assert(a==b);
+	b3Assert(b==c);
+	
+	if (searchIncremental3dSapOnGpu)
+	{
+		B3_PROFILE("computePairsIncremental3dSapKernelGPU");
+		int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
+		int maxCapacity = 1024*1024;
+		{
+			B3_PROFILE("copy from host");
+			m_objectMinMaxIndexGPUaxis0.copyFromHost(m_objectMinMaxIndexCPU[0][m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis1.copyFromHost(m_objectMinMaxIndexCPU[1][m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis2.copyFromHost(m_objectMinMaxIndexCPU[2][m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis0prev.copyFromHost(m_objectMinMaxIndexCPU[0][1-m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis1prev.copyFromHost(m_objectMinMaxIndexCPU[1][1-m_currentBuffer]);
+			m_objectMinMaxIndexGPUaxis2prev.copyFromHost(m_objectMinMaxIndexCPU[2][1-m_currentBuffer]);
+
+			m_sortedAxisGPU0.copyFromHost(m_sortedAxisCPU[0][m_currentBuffer]);
+			m_sortedAxisGPU1.copyFromHost(m_sortedAxisCPU[1][m_currentBuffer]);
+			m_sortedAxisGPU2.copyFromHost(m_sortedAxisCPU[2][m_currentBuffer]);
+			m_sortedAxisGPU0prev.copyFromHost(m_sortedAxisCPU[0][1-m_currentBuffer]);
+			m_sortedAxisGPU1prev.copyFromHost(m_sortedAxisCPU[1][1-m_currentBuffer]);
+			m_sortedAxisGPU2prev.copyFromHost(m_sortedAxisCPU[2][1-m_currentBuffer]);
+
+		
+			m_addedHostPairsGPU.resize(maxCapacity);
+			m_removedHostPairsGPU.resize(maxCapacity);
+
+			m_addedCountGPU.resize(0);
+			m_addedCountGPU.push_back(0);
+			m_removedCountGPU.resize(0);
+			m_removedCountGPU.push_back(0);
+		}
+
+		{
+			B3_PROFILE("launch1D");
+			b3LauncherCL launcher(m_queue,  m_computePairsIncremental3dSapKernel);
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis0.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis1.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis2.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis0prev.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis1prev.getBufferCL());
+			launcher.setBuffer(m_objectMinMaxIndexGPUaxis2prev.getBufferCL());
+
+			launcher.setBuffer(m_sortedAxisGPU0.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU1.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU2.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU0prev.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU1prev.getBufferCL());
+			launcher.setBuffer(m_sortedAxisGPU2prev.getBufferCL());
+
+		
+			launcher.setBuffer(m_addedHostPairsGPU.getBufferCL());
+			launcher.setBuffer(m_removedHostPairsGPU.getBufferCL());
+			launcher.setBuffer(m_addedCountGPU.getBufferCL());
+			launcher.setBuffer(m_removedCountGPU.getBufferCL());
+			launcher.setConst(maxCapacity);
+			launcher.setConst( numObjects);
+			launcher.launch1D( numObjects);
+			clFinish(m_queue);
+		}
+
+		{
+			B3_PROFILE("copy to host");
+			int addedCountGPU = m_addedCountGPU.at(0);
+			m_addedHostPairsGPU.resize(addedCountGPU);
+			m_addedHostPairsGPU.copyToHost(addedHostPairs);
+
+			//printf("addedCountGPU=%d\n",addedCountGPU);
+			int removedCountGPU = m_removedCountGPU.at(0);
+			m_removedHostPairsGPU.resize(removedCountGPU);
+			m_removedHostPairsGPU.copyToHost(removedHostPairs);
+			//printf("removedCountGPU=%d\n",removedCountGPU);
+
+		}
+
+
+
+	} 
+	else
+	{
+		int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
+
+		B3_PROFILE("actual search");
+		for (int i=0;i<numObjects;i++)
+		{
+			//int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size();
+			//int checkObjects[]={40,53};
+			//int numCheckObjects = sizeof(checkObjects)/sizeof(int);
+			
+			//for (int a=0;a<numCheckObjects ;a++)
+			
+			for (int axis=0;axis<3;axis++)
+			{
+				//int i = checkObjects[a];
+
+				unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x;
+				unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y;
+				unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].x;
+				int dmin = curMinIndex - prevMinIndex;
+				
+				unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][i].y;
+
+				
+
+				int dmax = curMaxIndex - prevMaxIndex;
+				if (dmin!=0)
+				{
+					//printf("for object %d, dmin=%d\n",i,dmin);
+				}
+				if (dmax!=0)
+				{
+					//printf("for object %d, dmax=%d\n",i,dmax);
+				}
+				for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)
+				{
+					if (dmin!=0)
+					{
+						int stepMin = dmin<0 ? -1 : 1;
+						for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)
+						{
+							int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
+							int otherIndex = otherIndex2/2;
+							if (otherIndex!=i)
+							{
+								bool otherIsMax = ((otherIndex2&1)!=0);
+
+								if (otherIsMax)
+								{
+									//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
+									//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
+									
+									bool overlap = true;
+
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
+											overlap=false;
+									}
+
+								//	b3Assert(overlap2==overlap);
+
+									bool prevOverlap = true;
+
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x))
+											prevOverlap=false;
+									}
+									
+
+									//b3Assert(overlap==overlap2);
+								
+
+
+									if (dmin<0)
+									{
+										if (overlap && !prevOverlap)
+										{
+											//add a pair
+											b3Int2 newPair;
+											if (i<=otherIndex)
+											{
+												newPair.x = i;
+												newPair.y = otherIndex;
+											} else
+											{
+												newPair.x = otherIndex;
+												newPair.y = i;
+											}
+											addedHostPairs.push_back(newPair);
+										}
+									} 
+									else
+									{
+										if (!overlap && prevOverlap)
+										{
+									
+											//remove a pair
+											b3Int2 removedPair;
+											if (i<=otherIndex)
+											{
+												removedPair.x = i;
+												removedPair.y = otherIndex;
+											} else
+											{
+												removedPair.x = otherIndex;
+												removedPair.y = i;
+											}
+											removedHostPairs.push_back(removedPair);
+										}
+									}//otherisMax
+								}//if (dmin<0)
+							}//if (otherIndex!=i)
+						}//for (int j=
+					}
+				
+					if (dmax!=0)
+					{
+						int stepMax = dmax<0 ? -1 : 1;
+						for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)
+						{
+							int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
+							int otherIndex = otherIndex2/2;
+							if (otherIndex!=i)
+							{
+								bool otherIsMin = ((otherIndex2&1)==0);
+								//if (otherIsMin)
+								{
+									//bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
+									//bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
+									
+									bool overlap = true;
+
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
+											overlap=false;
+									}
+									//b3Assert(overlap2==overlap);
+
+									bool prevOverlap = true;
+
+									for (int ax=0;ax<3;ax++)
+									{
+										if ((m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].y) ||
+											(m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1-m_currentBuffer][otherIndex].x))
+											prevOverlap=false;
+									}
+									
+
+									if (dmax>0)
+									{
+										if (overlap && !prevOverlap)
+										{
+											//add a pair
+											b3Int2 newPair;
+											if (i<=otherIndex)
+											{
+												newPair.x = i;
+												newPair.y = otherIndex;
+											} else
+											{
+												newPair.x = otherIndex;
+												newPair.y = i;
+											}
+											addedHostPairs.push_back(newPair);
+							
+										}
+									} 
+									else
+									{
+										if (!overlap && prevOverlap)
+										{
+											//if (otherIndex2&1==0) -> min?
+											//remove a pair
+											b3Int2 removedPair;
+											if (i<=otherIndex)
+											{
+												removedPair.x = i;
+												removedPair.y = otherIndex;
+											} else
+											{
+												removedPair.x = otherIndex;
+												removedPair.y = i;
+											}
+											removedHostPairs.push_back(removedPair);
+								
+										}
+									}
+							
+								}//if (dmin<0)
+							}//if (otherIndex!=i)
+						}//for (int j=
+					}
+				}//for (int otherbuffer
+			}//for (int axis=0;
+		}//for (int i=0;i<numObjects
+	}
+
+	//remove duplicates and add/remove then to existing m_overlappingPairs
+	
+	
+	
+	{
+		{
+			B3_PROFILE("sort allPairs");
+			allPairs.quickSort(b3PairCmp);
+		}
+		{
+			B3_PROFILE("sort addedHostPairs");
+			addedHostPairs.quickSort(b3PairCmp);
+		}
+		{
+			B3_PROFILE("sort removedHostPairs");
+			removedHostPairs.quickSort(b3PairCmp);
+		}
+	}
+
+	b3Int2 prevPair;
+	prevPair.x = -1;
+	prevPair.y = -1;
+	
+	int uniqueRemovedPairs = 0;
+
+	b3AlignedObjectArray<int> removedPositions;
+
+	{
+		B3_PROFILE("actual removing");
+		for (int i=0;i<removedHostPairs.size();i++)
+		{
+			b3Int2 removedPair = removedHostPairs[i];
+			if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y))
+			{
+
+			int index1 = allPairs.findBinarySearch(removedPair);
+
+	//#ifdef _DEBUG
+				
+				
+	
+				int index2 = allPairs.findLinearSearch(removedPair);
+				b3Assert(index1==index2);
+	
+				//b3Assert(index1!=allPairs.size());
+				if (index1<allPairs.size())
+	//#endif//_DEBUG
+				{
+					uniqueRemovedPairs++;
+					removedPositions.push_back(index1);
+					{
+						//printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y);
+					}
+				}
+			}
+			prevPair = removedPair;
+		}
+
+		if (uniqueRemovedPairs)
+		{
+			for (int i=0;i<removedPositions.size();i++)
+			{
+				allPairs[removedPositions[i]].x = INT_MAX ;
+				allPairs[removedPositions[i]].y = INT_MAX ;
+			}
+			allPairs.quickSort(b3PairCmp);
+			allPairs.resize(allPairs.size()-uniqueRemovedPairs);
+		}
+	}
+	//if (uniqueRemovedPairs)
+	//	printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs);
+	//printf("removedHostPairs.size = %d\n",removedHostPairs.size());
+
+	prevPair.x = -1;
+	prevPair.y = -1;
+	
+	int uniqueAddedPairs=0;
+	b3AlignedObjectArray<b3Int2> actualAddedPairs;
+
+	{
+		B3_PROFILE("actual adding");
+		for (int i=0;i<addedHostPairs.size();i++)
+		{
+			b3Int2 newPair = addedHostPairs[i];
+			if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y))
+			{
+//#ifdef _DEBUG		
+				int index1 = allPairs.findBinarySearch(newPair);
+				
+	
+				int index2 = allPairs.findLinearSearch(newPair);
+				b3Assert(index1==index2);
+	
+
+				b3Assert(index1==allPairs.size());
+				if (index1!=allPairs.size())
+				{
+					printf("??\n");
+				}
+
+				if (index1==allPairs.size())
+//#endif //_DEBUG
+				{
+					uniqueAddedPairs++;
+					actualAddedPairs.push_back(newPair);
+				}
+			}
+			prevPair = newPair;
+		}
+		for (int i=0;i<actualAddedPairs.size();i++)
+		{
+			//printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y);
+			allPairs.push_back(actualAddedPairs[i]);
+		}
+	}
+	
+	//if (uniqueAddedPairs)
+	//	printf("uniqueAddedPairs=%d\n", uniqueAddedPairs);
+
+
+
+	//b3AlignedObjectArray<b3Int2> addedHostPairs;
+	//b3AlignedObjectArray<b3Int2> removedHostPairs;
+	{
+		B3_PROFILE("m_overlappingPairs.copyFromHost");
+		m_overlappingPairs.copyFromHost(allPairs);
+	}
 
 	
 }
@@ -177,8 +845,8 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
 void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
 {
 	//test
-	//if (m_currentBuffer>=0)
-	//	calculateOverlappingPairsHostIncremental3Sap();
+	if (m_currentBuffer>=0)
+		return calculateOverlappingPairsHostIncremental3Sap();
 
 	b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
 	m_allAabbsGPU.copyToHost(m_allAabbsCPU);
@@ -203,6 +871,7 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
 	
 	int axis=0;
 	{
+		B3_PROFILE("CPU compute best variance axis");
 		b3Vector3 s(0,0,0),s2(0,0,0);
 		int numRigidBodies = numSmallAabbs;
 
@@ -250,8 +919,17 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
 					(b3Vector3&)m_smallAabbsCPU[j].m_min,(b3Vector3&)m_smallAabbsCPU[j].m_max))
 				{
 					b3Int2 pair;
-					pair.x = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
-					pair.y = m_smallAabbsCPU[j].m_minIndices[3];
+					int a = m_smallAabbsCPU[i].m_minIndices[3];
+					int b = m_smallAabbsCPU[j].m_minIndices[3];
+					if (a<=b)
+					{
+						pair.x = a;//store the original index in the unsorted aabb array
+						pair.y = b;
+					} else
+					{
+						pair.x = b;//store the original index in the unsorted aabb array
+						pair.y = a;
+					}
 					hostPairs.push_back(pair);
 				}
 			}
@@ -272,8 +950,18 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
 					(b3Vector3&)m_largeAabbsCPU[j].m_min,(b3Vector3&)m_largeAabbsCPU[j].m_max))
 				{
 					b3Int2 pair;
-					pair.x = m_largeAabbsCPU[j].m_minIndices[3];
-					pair.y = m_smallAabbsCPU[i].m_minIndices[3];//store the original index in the unsorted aabb array
+					int a = m_largeAabbsCPU[j].m_minIndices[3];
+					int b = m_smallAabbsCPU[i].m_minIndices[3];
+					if (a<=b)
+					{
+						pair.x = a; 
+						pair.y = b;//store the original index in the unsorted aabb array
+					} else
+					{
+						pair.x = b;
+						pair.y = a;//store the original index in the unsorted aabb array
+					}
+					
 					hostPairs.push_back(pair);
 				}
 			}
@@ -293,7 +981,7 @@ void  b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
 		m_overlappingPairs.resize(0);
 	}
 
-	//init3dSap();
+	init3dSap();
 
 }
 
@@ -312,6 +1000,9 @@ void  b3GpuSapBroadphase::reset()
 
 void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
 {
+	//if (m_currentBuffer>=0)
+	//	return calculateOverlappingPairsHostIncremental3Sap();
+
 	B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs");
 
 	int axis = 0;
@@ -363,7 +1054,7 @@ void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
 
 	if (m_prefixScanFloat4)
 	{
-		B3_PROFILE("compute best variance axis");
+		B3_PROFILE("GPU compute best variance axis");
 		int numSmallAabbs = m_smallAabbsGPU.size();
 		if (m_dst.size()!=(numSmallAabbs+1))
 		{
@@ -609,7 +1300,7 @@ void  b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
 		
 	}//B3_PROFILE("GPU_RADIX SORT");
 
-	
+	//init3dSap();
 }
 
 void b3GpuSapBroadphase::writeAabbsToGpu()
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
index 262ad7ce8..92c8d6169 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3GpuSapBroadphase.h
@@ -7,7 +7,7 @@ class b3Vector3;
 #include "Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h"
 
 #include "b3SapAabb.h"
-
+#include "Bullet3Common/b3Int2.h"
 
 
 class b3GpuSapBroadphase
@@ -22,11 +22,33 @@ class b3GpuSapBroadphase
 	cl_kernel				m_sapKernel;
 	cl_kernel				m_sap2Kernel;
 	cl_kernel				m_prepareSumVarianceKernel;
+	cl_kernel				m_computePairsIncremental3dSapKernel;
 
 	class b3RadixSort32CL* m_sorter;
 
 	///test for 3d SAP
 	b3AlignedObjectArray<b3SortData>		m_sortedAxisCPU[3][2];
+	b3AlignedObjectArray<b3UnsignedInt2>	m_objectMinMaxIndexCPU[3][2];
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis0;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis1;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis2;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis0prev;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis1prev;
+	b3OpenCLArray<b3UnsignedInt2>			m_objectMinMaxIndexGPUaxis2prev;
+
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU0;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU1;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU2;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU0prev;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU1prev;
+	b3OpenCLArray<b3SortData>				m_sortedAxisGPU2prev;
+
+
+	b3OpenCLArray<b3Int2>					m_addedHostPairsGPU;
+	b3OpenCLArray<b3Int2>					m_removedHostPairsGPU;
+	b3OpenCLArray<int>						m_addedCountGPU;
+	b3OpenCLArray<int>						m_removedCountGPU;
+	
 	int	m_currentBuffer;
 
 	public:
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h b/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
index 81661db6f..7f971ba54 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
+++ b/src/Bullet3OpenCL/BroadphaseCollision/b3SapAabb.h
@@ -1,7 +1,9 @@
 #ifndef B3_SAP_AABB_H
 #define B3_SAP_AABB_H
 
-struct b3SapAabb
+#include "Bullet3Common/b3Scalar.h"
+
+B3_ATTRIBUTE_ALIGNED16(struct) b3SapAabb
 {
 	union
 	{
diff --git a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
index 4bf018384..93fd929e3 100644
--- a/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
+++ b/src/Bullet3OpenCL/BroadphaseCollision/kernels/sapFast.cl
@@ -30,6 +30,22 @@ typedef struct
 	};
 } btAabbCL;
 
+typedef struct 
+{
+	union
+	{
+		unsigned int m_key;
+		unsigned int x;
+	};
+
+	union
+	{
+		unsigned int m_value;
+		unsigned int y;
+		
+	};
+}b3SortData;
+
 
 /// conservative test for overlap between two aabbs
 bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2);
@@ -46,6 +62,260 @@ bool TestAabbAgainstAabb2(const btAabbCL* aabb1, __local const btAabbCL* aabb2)
 	return overlap;
 }
 
+__kernel void   computePairsIncremental3dSapKernel( __global const uint2* objectMinMaxIndexGPUaxis0,
+													__global const uint2* objectMinMaxIndexGPUaxis1,
+													__global const uint2* objectMinMaxIndexGPUaxis2,
+													__global const uint2* objectMinMaxIndexGPUaxis0prev,
+													__global const uint2* objectMinMaxIndexGPUaxis1prev,
+													__global const uint2* objectMinMaxIndexGPUaxis2prev,
+													__global const b3SortData*	   sortedAxisGPU0,
+													__global const b3SortData*	   sortedAxisGPU1,
+													__global const b3SortData*	   sortedAxisGPU2,
+													__global const b3SortData*	   sortedAxisGPU0prev,
+													__global const b3SortData*	   sortedAxisGPU1prev,
+													__global const b3SortData*	   sortedAxisGPU2prev,
+													__global int2*			addedHostPairsGPU,
+													__global int2*			removedHostPairsGPU,
+													volatile __global int*				addedHostPairsCount,
+													volatile __global int*				removedHostPairsCount,
+													int maxCapacity,
+													int numObjects)
+{
+	int i = get_global_id(0);
+	if (i>=numObjects)
+		return;
+
+	__global const uint2* objectMinMaxIndexGPU[3][2];
+	objectMinMaxIndexGPU[0][0]=objectMinMaxIndexGPUaxis0;
+	objectMinMaxIndexGPU[1][0]=objectMinMaxIndexGPUaxis1;
+	objectMinMaxIndexGPU[2][0]=objectMinMaxIndexGPUaxis2;
+	objectMinMaxIndexGPU[0][1]=objectMinMaxIndexGPUaxis0prev;
+	objectMinMaxIndexGPU[1][1]=objectMinMaxIndexGPUaxis1prev;
+	objectMinMaxIndexGPU[2][1]=objectMinMaxIndexGPUaxis2prev;
+
+	__global const b3SortData* sortedAxisGPU[3][2];
+	sortedAxisGPU[0][0] = sortedAxisGPU0;
+	sortedAxisGPU[1][0] = sortedAxisGPU1;
+	sortedAxisGPU[2][0] = sortedAxisGPU2;
+	sortedAxisGPU[0][1] = sortedAxisGPU0prev;
+	sortedAxisGPU[1][1] = sortedAxisGPU1prev;
+	sortedAxisGPU[2][1] = sortedAxisGPU2prev;
+
+	int m_currentBuffer = 0;
+
+	for (int axis=0;axis<3;axis++)
+	{
+		//int i = checkObjects[a];
+
+		unsigned int curMinIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].x;
+		unsigned int curMaxIndex = objectMinMaxIndexGPU[axis][m_currentBuffer][i].y;
+		unsigned int prevMinIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].x;
+		int dmin = curMinIndex - prevMinIndex;
+				
+		unsigned int prevMaxIndex = objectMinMaxIndexGPU[axis][1-m_currentBuffer][i].y;
+
+		int dmax = curMaxIndex - prevMaxIndex;
+	
+		for (int otherbuffer = 0;otherbuffer<2;otherbuffer++)
+		{
+			if (dmin!=0)
+			{
+				int stepMin = dmin<0 ? -1 : 1;
+				for (int j=prevMinIndex;j!=curMinIndex;j+=stepMin)
+				{
+					int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;
+					int otherIndex = otherIndex2/2;
+					if (otherIndex!=i)
+					{
+						bool otherIsMax = ((otherIndex2&1)!=0);
+
+						if (otherIsMax)
+						{
+									
+							bool overlap = true;
+
+							for (int ax=0;ax<3;ax++)
+							{
+								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||
+									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))
+									overlap=false;
+							}
+
+						//	b3Assert(overlap2==overlap);
+
+							bool prevOverlap = true;
+
+							for (int ax=0;ax<3;ax++)
+							{
+								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||
+									(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))
+									prevOverlap=false;
+							}
+									
+
+							//b3Assert(overlap==overlap2);
+								
+
+
+							if (dmin<0)
+							{
+								if (overlap && !prevOverlap)
+								{
+									//add a pair
+									int2 newPair;
+									if (i<=otherIndex)
+									{
+										newPair.x = i;
+										newPair.y = otherIndex;
+									} else
+									{
+										newPair.x = otherIndex;
+										newPair.y = i;
+									}
+									
+									{
+										int curPair = atomic_inc(addedHostPairsCount);
+										if (curPair<maxCapacity)
+										{
+											
+											addedHostPairsGPU[curPair].x = newPair.x;
+											addedHostPairsGPU[curPair].y = newPair.y;
+										}
+									}
+
+								}
+							} 
+							else
+							{
+								if (!overlap && prevOverlap)
+								{
+									
+									//remove a pair
+									int2 removedPair;
+									if (i<=otherIndex)
+									{
+										removedPair.x = i;
+										removedPair.y = otherIndex;
+									} else
+									{
+										removedPair.x = otherIndex;
+										removedPair.y = i;
+									}
+									{
+										int curPair = atomic_inc(removedHostPairsCount);
+										if (curPair<maxCapacity)
+										{
+											
+											removedHostPairsGPU[curPair].x = removedPair.x;
+											removedHostPairsGPU[curPair].y = removedPair.y;
+
+										}
+									}
+								}
+							}//otherisMax
+						}//if (dmin<0)
+					}//if (otherIndex!=i)
+				}//for (int j=
+			}
+				
+			if (dmax!=0)
+			{
+				int stepMax = dmax<0 ? -1 : 1;
+				for (int j=prevMaxIndex;j!=curMaxIndex;j+=stepMax)
+				{
+					int otherIndex2 = sortedAxisGPU[axis][otherbuffer][j].y;
+					int otherIndex = otherIndex2/2;
+					if (otherIndex!=i)
+					{
+						bool otherIsMin = ((otherIndex2&1)==0);
+						if (otherIsMin)
+						{
+									
+							bool overlap = true;
+
+							for (int ax=0;ax<3;ax++)
+							{
+								if ((objectMinMaxIndexGPU[ax][m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].y) ||
+									(objectMinMaxIndexGPU[ax][m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][m_currentBuffer][otherIndex].x))
+									overlap=false;
+							}
+							//b3Assert(overlap2==overlap);
+
+							bool prevOverlap = true;
+
+							for (int ax=0;ax<3;ax++)
+							{
+								if ((objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].x > objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].y) ||
+									(objectMinMaxIndexGPU[ax][1-m_currentBuffer][i].y < objectMinMaxIndexGPU[ax][1-m_currentBuffer][otherIndex].x))
+									prevOverlap=false;
+							}
+									
+
+							if (dmax>0)
+							{
+								if (overlap && !prevOverlap)
+								{
+									//add a pair
+									int2 newPair;
+									if (i<=otherIndex)
+									{
+										newPair.x = i;
+										newPair.y = otherIndex;
+									} else
+									{
+										newPair.x = otherIndex;
+										newPair.y = i;
+									}
+									{
+										int curPair = atomic_inc(addedHostPairsCount);
+										if (curPair<maxCapacity)
+										{
+											
+											addedHostPairsGPU[curPair].x = newPair.x;
+											addedHostPairsGPU[curPair].y = newPair.y;
+										}
+									}
+							
+								}
+							} 
+							else
+							{
+								if (!overlap && prevOverlap)
+								{
+									//if (otherIndex2&1==0) -> min?
+									//remove a pair
+									int2 removedPair;
+									if (i<=otherIndex)
+									{
+										removedPair.x = i;
+										removedPair.y = otherIndex;
+									} else
+									{
+										removedPair.x = otherIndex;
+										removedPair.y = i;
+									}
+									{
+										int curPair = atomic_inc(removedHostPairsCount);
+										if (curPair<maxCapacity)
+										{
+											
+											removedHostPairsGPU[curPair].x = removedPair.x;
+											removedHostPairsGPU[curPair].y = removedPair.y;
+										}
+									}
+								
+								}
+							}
+							
+						}//if (dmin<0)
+					}//if (otherIndex!=i)
+				}//for (int j=
+			}
+		}//for (int otherbuffer
+	}//for (int axis=0;
+
+
+}
 
 //computePairsKernelBatchWrite
 __kernel void   computePairsKernel( __global const btAabbCL* aabbs, volatile __global int2* pairsOut,volatile  __global int* pairCount, int numObjects, int axis, int maxPairs)
diff --git a/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h b/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
index 0adebb947..975bd80e5 100644
--- a/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
+++ b/src/Bullet3OpenCL/ParallelPrimitives/b3RadixSort32CL.h
@@ -8,14 +8,15 @@ struct b3SortData
 {
 	union
 	{
-		int m_key;
-		int x;
+		unsigned int m_key;
+		unsigned int x;
 	};
 
 	union
 	{
-		int m_value;
-		int y;
+		unsigned int m_value;
+		unsigned int y;
+		
 	};
 };
 #include "b3BufferInfoCL.h"