added Samurai Monastery wavefront .obj

added alternative batching kernel (slow)
tweaked controls a bit
added command-line options --selected_demo=<int> and --new_batching
started looking into parallel 3D sweep-and-prune (SAP)
This commit is contained in:
erwin coumans
2013-03-23 23:00:50 -07:00
parent 11d934b63a
commit 68062bdfbd
18 changed files with 333195 additions and 76 deletions
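
A minimal, purely hypothetical sketch of how the new --selected_demo=<int> and --new_batching switches could be wired up; the demo's actual argument handling is not part of this diff, so parseCommandLine and gSelectedDemo are illustrative names only (useNewBatchingKernel is the global this commit adds to Solver.cpp, defined locally here so the sketch compiles on its own):

// Hypothetical sketch only: not taken from the diff below.
#include <cstdio>
#include <cstring>

bool useNewBatchingKernel = false; // in this commit it is the global added to Solver.cpp
static int gSelectedDemo = 0;      // illustrative name, not from the diff

static void parseCommandLine(int argc, char* argv[])
{
    for (int i = 1; i < argc; i++)
    {
        int demo;
        if (sscanf(argv[i], "--selected_demo=%d", &demo) == 1)
            gSelectedDemo = demo;            // pick which demo scene to run
        else if (strcmp(argv[i], "--new_batching") == 0)
            useNewBatchingKernel = true;     // switch to the (slow) experimental batching kernel
    }
}

int main(int argc, char* argv[])
{
    parseCommandLine(argc, argv);
    printf("demo=%d new_batching=%d\n", gSelectedDemo, (int)useNewBatchingKernel);
    return 0;
}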

View File

@@ -20,7 +20,8 @@ m_smallAabbsGPU(ctx,q),
m_largeAabbsGPU(ctx,q),
m_overlappingPairs(ctx,q),
m_gpuSmallSortData(ctx,q),
m_gpuSmallSortedAabbs(ctx,q)
m_gpuSmallSortedAabbs(ctx,q),
m_currentBuffer(-1)
{
const char* sapSrc = sapCL;
const char* sapFastSrc = sapFastCL;
@@ -86,18 +87,87 @@ static bool TestAabbAgainstAabb2(const btVector3 &aabbMin1, const btVector3 &aab
return overlap;
}
void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
//http://stereopsis.com/radix.html
static unsigned int FloatFlip(float fl)
{
int axis = 0;//todo on GPU for now hardcode
unsigned int f = *(unsigned int*)&fl;
unsigned int mask = -(int)(f >> 31) | 0x80000000;
return f ^ mask;
};
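//Side note (not part of this commit): FloatFlip is the stereopsis radix trick linked above.
//It remaps IEEE-754 floats so that unsigned integer comparison of the flipped keys matches
//float ordering, which is what makes float min-coordinates usable as radix-sort keys.
//A self-contained sketch, including the usual inverse IFloatFlip (also not in this diff):
#include <cstdio>
#include <cstring>

static unsigned int FloatFlipRef(float fl)           // mirrors FloatFlip above, via memcpy
{
    unsigned int f;
    memcpy(&f, &fl, sizeof(f));
    unsigned int mask = -(int)(f >> 31) | 0x80000000;
    return f ^ mask;
}

static float IFloatFlipRef(unsigned int f)           // inverse mapping, for completeness
{
    unsigned int mask = ((f >> 31) - 1) | 0x80000000;
    unsigned int u = f ^ mask;
    float fl;
    memcpy(&fl, &u, sizeof(fl));
    return fl;
}

int main()
{
    float a = -1.5f, b = 0.25f;
    printf("%d\n", FloatFlipRef(a) < FloatFlipRef(b)); // prints 1: ordering is preserved
    printf("%f\n", IFloatFlipRef(FloatFlipRef(a)));    // prints -1.500000: round-trips
    return 0;
}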
void btGpuSapBroadphase::init3dSap()
{
if (m_currentBuffer<0)
{
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
m_currentBuffer = 0;
for (int axis=0;axis<3;axis++)
{
for (int buf=0;buf<2;buf++)
{
int totalNumAabbs = m_allAabbsCPU.size();
m_sortedAxisCPU[axis][buf].resize(totalNumAabbs);
if (buf==m_currentBuffer)
{
for (int i=0;i<totalNumAabbs;i++)
{
m_sortedAxisCPU[axis][buf][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
m_sortedAxisCPU[axis][buf][i].m_value = i;
}
}
}
}
}
}
void btGpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
{
btAssert(m_currentBuffer>=0);
if (m_currentBuffer<0)
return;
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
for (int axis=0;axis<3;axis++)
{
for (int buf=0;buf<2;buf++)
{
btAssert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size());
}
}
m_currentBuffer = 1-m_currentBuffer;
for (int axis=0;axis<3;axis++)
{
int totalNumAabbs = m_allAabbsCPU.size();
for (int i=0;i<totalNumAabbs;i++)
{
m_sortedAxisCPU[axis][m_currentBuffer][i].m_key = FloatFlip(m_allAabbsCPU[i].m_minIndices[axis]);
m_sortedAxisCPU[axis][m_currentBuffer][i].m_value = i;
}
}
}
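//Rough sketch (not from this commit) of the classic next step that these double-buffered,
//per-axis key/value arrays are preparing for: insertion-sort the freshly written axis and
//treat every swap of two entries as a potential pair add/remove event. The sweep itself is
//not implemented here yet (the call site below is still commented out).
#include <vector>

struct SortDataRef { unsigned int m_key; int m_value; };   // mirrors btSortData

// Returns the number of element swaps; each swap is where a real incremental
// 3-axis SAP would test the two AABBs and update the overlapping-pair set.
static int insertionSortAxis(std::vector<SortDataRef>& axis)
{
    int swaps = 0;
    for (size_t i = 1; i < axis.size(); i++)
    {
        SortDataRef cur = axis[i];
        size_t j = i;
        while (j > 0 && axis[j - 1].m_key > cur.m_key)
        {
            axis[j] = axis[j - 1];   // cur's endpoint crossed this endpoint
            j--;
            swaps++;                 // hook: a real SAP would add/remove the crossed pair here
        }
        axis[j] = cur;
    }
    return swaps;
}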
void btGpuSapBroadphase::calculateOverlappingPairsHost()
{
//test
//if (m_currentBuffer>=0)
// calculateOverlappingPairsHostIncremental3Sap();
int axis=0;
btAssert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
if (forceHost)
{
btAlignedObjectArray<btSapAabb> allHostAabbs;
m_allAabbsGPU.copyToHost(allHostAabbs);
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
{
int numSmallAabbs = m_smallAabbsCPU.size();
@@ -105,7 +175,7 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
//sync aabb
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
@@ -116,7 +186,7 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
//sync aabb
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
@@ -175,8 +245,15 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
m_overlappingPairs.resize(0);
}
return;
}
//init3dSap();
}
void btGpuSapBroadphase::calculateOverlappingPairs()
{
int axis = 0;//todo on GPU for now hardcode
{
@@ -185,8 +262,8 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
if (syncOnHost)
{
BT_PROFILE("Synchronize m_smallAabbsGPU (CPU/slow)");
btAlignedObjectArray<btSapAabb> allHostAabbs;
m_allAabbsGPU.copyToHost(allHostAabbs);
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
m_smallAabbsGPU.copyToHost(m_smallAabbsCPU);
{
@@ -195,7 +272,7 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
//sync aabb
int aabbIndex = m_smallAabbsCPU[j].m_signedMaxIndices[3];
m_smallAabbsCPU[j] = allHostAabbs[aabbIndex];
m_smallAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
m_smallAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
@@ -226,8 +303,8 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
if (syncOnHost)
{
BT_PROFILE("Synchronize m_largeAabbsGPU (CPU/slow)");
btAlignedObjectArray<btSapAabb> allHostAabbs;
m_allAabbsGPU.copyToHost(allHostAabbs);
m_allAabbsGPU.copyToHost(m_allAabbsCPU);
m_largeAabbsGPU.copyToHost(m_largeAabbsCPU);
{
@@ -236,7 +313,7 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
{
//sync aabb
int aabbIndex = m_largeAabbsCPU[j].m_signedMaxIndices[3];
m_largeAabbsCPU[j] = allHostAabbs[aabbIndex];
m_largeAabbsCPU[j] = m_allAabbsCPU[aabbIndex];
m_largeAabbsCPU[j].m_signedMaxIndices[3] = aabbIndex;
}
}
@@ -432,6 +509,7 @@ void btGpuSapBroadphase::calculateOverlappingPairs(bool forceHost)
}//BT_PROFILE("GPU_RADIX SORT");
}
void btGpuSapBroadphase::writeAabbsToGpu()

View File

@@ -24,6 +24,10 @@ class btGpuSapBroadphase
class btRadixSort32CL* m_sorter;
///test for 3d SAP
btAlignedObjectArray<btSortData> m_sortedAxisCPU[3][2];
int m_currentBuffer;
public:
btOpenCLArray<btSapAabb> m_allAabbsGPU;
@@ -45,7 +49,11 @@ class btGpuSapBroadphase
btGpuSapBroadphase(cl_context ctx,cl_device_id device, cl_command_queue q );
virtual ~btGpuSapBroadphase();
void calculateOverlappingPairs(bool forceHost=false);
void calculateOverlappingPairs();
void calculateOverlappingPairsHost();
void init3dSap();
void calculateOverlappingPairsHostIncremental3Sap();
void createProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);
void createLargeProxy(const btVector3& aabbMin, const btVector3& aabbMax, int userPtr ,short int collisionFilterGroup,short int collisionFilterMask);

View File

@@ -16,6 +16,8 @@ subject to the following restrictions:
#include "Solver.h"
///useNewBatchingKernel enables a rewritten batching kernel that uses just a single thread of the warp, for experiments
bool useNewBatchingKernel = false;
#define SOLVER_SETUP_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solverSetup.cl"
#define SOLVER_SETUP2_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solverSetup2.cl"
@@ -24,6 +26,7 @@ subject to the following restrictions:
#define SOLVER_FRICTION_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveFriction.cl"
#define BATCHING_PATH "opencl/gpu_rigidbody/kernels/batchingKernels.cl"
#define BATCHING_NEW_PATH "opencl/gpu_rigidbody/kernels/batchingKernelsNew.cl"
#include "../kernels/solverSetup.h"
@@ -33,6 +36,9 @@ subject to the following restrictions:
#include "../kernels/solveFriction.h"
#include "../kernels/batchingKernels.h"
#include "../kernels/batchingKernelsNew.h"
#include "BulletCommon/btQuickprof.h"
#include "../../parallel_primitives/host/btLauncherCL.h"
#include "BulletCommon/btVector3.h"
@@ -94,7 +100,7 @@ Solver::Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int
const int sortSize = BTNEXTMULTIPLEOF( pairCapacity, 512 );
m_sortDataBuffer = new btOpenCLArray<btSortData>(ctx,queue,sortSize);
m_contactBuffer = new btOpenCLArray<btContact4>(ctx,queue);
m_contactBuffer2 = new btOpenCLArray<btContact4>(ctx,queue);
m_numConstraints = new btOpenCLArray<unsigned int>(ctx,queue,N_SPLIT*N_SPLIT );
m_numConstraints->resize(N_SPLIT*N_SPLIT);
@@ -108,6 +114,8 @@ Solver::Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int
cl_int pErrNum;
const char* batchKernelSource = batchingKernelsCL;
const char* batchKernelNewSource = batchingKernelsNewCL;
const char* solverSetupSource = solverSetupCL;
const char* solverSetup2Source = solverSetup2CL;
const char* solveContactSource = solveContactCL;
@@ -159,13 +167,20 @@ Solver::Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int
m_batchingKernel = btOpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg,additionalMacros );
btAssert(m_batchingKernel);
}
{
cl_program batchingNewProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, BATCHING_NEW_PATH);
btAssert(batchingNewProg);
m_batchingKernelNew = btOpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
//m_batchingKernelNew = btOpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
btAssert(m_batchingKernelNew);
}
}
Solver::~Solver()
{
delete m_sortDataBuffer;
delete m_contactBuffer;
delete m_contactBuffer2;
delete m_sort32;
delete m_scan;
@@ -173,6 +188,7 @@ Solver::~Solver()
clReleaseKernel(m_batchingKernel);
clReleaseKernel(m_batchingKernelNew);
clReleaseKernel( m_solveContactKernel);
clReleaseKernel( m_solveFrictionKernel);
@@ -843,7 +859,7 @@ void Solver::batchContacts( btOpenCLArray<btContact4>* contacts, int nContacts,
btBufferInfoCL bInfo[] = {
btBufferInfoCL( contacts->getBufferCL() ),
btBufferInfoCL( m_contactBuffer->getBufferCL() ),
btBufferInfoCL( m_contactBuffer2->getBufferCL()),
btBufferInfoCL( nNative->getBufferCL() ),
btBufferInfoCL( offsetsNative->getBufferCL() ),
#ifdef BATCH_DEBUG
@@ -852,10 +868,22 @@ void Solver::batchContacts( btOpenCLArray<btContact4>* contacts, int nContacts,
};
{
BT_PROFILE("batchingKernel");
btLauncherCL launcher( m_queue, m_batchingKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
//btLauncherCL launcher( m_queue, m_batchingKernel);
cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
btLauncherCL launcher( m_queue, k);
if (!useNewBatchingKernel )
{
launcher.setBuffer( contacts->getBufferCL() );
}
launcher.setBuffer( m_contactBuffer2->getBufferCL() );
launcher.setBuffer( nNative->getBufferCL());
launcher.setBuffer( offsetsNative->getBufferCL());
//launcher.setConst( cdata );
launcher.setConst(staticIdx);
@@ -899,7 +927,7 @@ void Solver::batchContacts( btOpenCLArray<btContact4>* contacts, int nContacts,
}
// copy buffer to buffer
btAssert(m_contactBuffer->size()==nContacts);
//btAssert(m_contactBuffer->size()==nContacts);
//contacts->copyFromOpenCLArray( *m_contactBuffer);
//clFinish(m_queue);//needed?

View File

@@ -94,6 +94,7 @@ class Solver : public SolverBase
int m_nIterations;
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
cl_kernel m_contactToConstraintKernel;
@@ -106,7 +107,7 @@ class Solver : public SolverBase
class btPrefixScanCL* m_scan;
btOpenCLArray<btSortData>* m_sortDataBuffer;
btOpenCLArray<btContact4>* m_contactBuffer;
btOpenCLArray<btContact4>* m_contactBuffer2;
enum
{

View File

@@ -18,7 +18,7 @@ struct btConfig
int m_maxTriConvexPairCapacity;
btConfig()
:m_maxConvexBodies(64*1024),
:m_maxConvexBodies(32*1024),
m_maxConvexShapes(8192),
m_maxVerticesPerFace(64),
m_maxFacesPerShape(64),
@@ -26,7 +26,7 @@ struct btConfig
m_maxConvexIndices(8192),
m_maxConvexUniqueEdges(8192),
m_maxCompoundChildShapes(8192),
m_maxTriConvexPairCapacity(64*1024)
m_maxTriConvexPairCapacity(512*1024)
{
m_maxBroadphasePairs = 16*m_maxConvexBodies;
}
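//Note (not in the original header): with the new defaults this gives
//  m_maxBroadphasePairs       = 16 * 32*1024 = 524288 pairs, and
//  m_maxTriConvexPairCapacity was raised from 64*1024 = 65536 to 512*1024 = 524288.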

View File

@@ -17,12 +17,14 @@
#define SOLVER_CONTACT_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveContact.cl"
#define SOLVER_FRICTION_KERNEL_PATH "opencl/gpu_rigidbody/kernels/solveFriction.cl"
#define BATCHING_PATH "opencl/gpu_rigidbody/kernels/batchingKernels.cl"
#define BATCHING_NEW_PATH "opencl/gpu_rigidbody/kernels/batchingKernelsNew.cl"
#include "../kernels/solverSetup.h"
#include "../kernels/solverSetup2.h"
#include "../kernels/solveContact.h"
#include "../kernels/solveFriction.h"
#include "../kernels/batchingKernels.h"
#include "../kernels/batchingKernelsNew.h"
@@ -48,13 +50,13 @@ struct btGpuBatchingPgsSolverInternalData
int m_nIterations;
btOpenCLArray<btGpuConstraint4>* m_contactCGPU;
btOpenCLArray<unsigned int>* m_numConstraints;
btOpenCLArray<unsigned int>* m_offsets;
Solver* m_solverGPU;
cl_kernel m_batchingKernel;
cl_kernel m_batchingKernelNew;
cl_kernel m_solveContactKernel;
cl_kernel m_solveFrictionKernel;
cl_kernel m_contactToConstraintKernel;
@@ -72,6 +74,11 @@ struct btGpuBatchingPgsSolverInternalData
btOpenCLArray<btRigidBodyCL>* m_bodyBufferGPU;
btOpenCLArray<btInertiaCL>* m_inertiaBufferGPU;
btOpenCLArray<btContact4>* m_pBufContactOutGPU;
btAlignedObjectArray<unsigned int> m_idxBuffer;
btAlignedObjectArray<btSortData> m_sortData;
btAlignedObjectArray<btContact4> m_old;
};
@@ -114,6 +121,7 @@ btGpuBatchingPgsSolver::btGpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
cl_int pErrNum;
const char* batchKernelSource = batchingKernelsCL;
const char* batchKernelNewSource = batchingKernelsNewCL;
const char* solverSetupSource = solverSetupCL;
const char* solverSetup2Source = solverSetup2CL;
const char* solveContactSource = solveContactCL;
@@ -166,7 +174,14 @@ btGpuBatchingPgsSolver::btGpuBatchingPgsSolver(cl_context ctx,cl_device_id devic
btAssert(m_data->m_batchingKernel);
}
{
cl_program batchingNewProg = btOpenCLUtils::compileCLProgramFromString( ctx, device, batchKernelNewSource, &pErrNum,additionalMacros, BATCHING_NEW_PATH);
btAssert(batchingNewProg);
m_data->m_batchingKernelNew = btOpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg,additionalMacros );
btAssert(m_data->m_batchingKernelNew);
}
@@ -186,6 +201,7 @@ btGpuBatchingPgsSolver::~btGpuBatchingPgsSolver()
clReleaseKernel(m_data->m_batchingKernel);
clReleaseKernel(m_data->m_batchingKernelNew);
clReleaseKernel( m_data->m_solveContactKernel);
clReleaseKernel( m_data->m_solveFrictionKernel);
@@ -406,7 +422,7 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
csCfg.m_averageExtent = .2f;//@TODO m_averageObjExtent;
csCfg.m_staticIdx = 0;//m_static0Index;//m_planeBodyIndex;
btOpenCLArray<btContact4>* contactsIn = m_data->m_pBufContactOutGPU;
btOpenCLArray<btRigidBodyCL>* bodyBuf = m_data->m_bodyBufferGPU;
void* additionalData = 0;//m_data->m_frictionCGPU;
@@ -419,16 +435,17 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
{
if( m_data->m_solverGPU->m_contactBuffer)
if( m_data->m_solverGPU->m_contactBuffer2)
{
m_data->m_solverGPU->m_contactBuffer->resize(nContacts);
m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
}
if( m_data->m_solverGPU->m_contactBuffer == 0 )
if( m_data->m_solverGPU->m_contactBuffer2 == 0 )
{
m_data->m_solverGPU->m_contactBuffer = new btOpenCLArray<btContact4>(m_data->m_context,m_data->m_queue, nContacts );
m_data->m_solverGPU->m_contactBuffer->resize(nContacts);
m_data->m_solverGPU->m_contactBuffer2 = new btOpenCLArray<btContact4>(m_data->m_context,m_data->m_queue, nContacts );
m_data->m_solverGPU->m_contactBuffer2->resize(nContacts);
}
clFinish(m_data->m_queue);
@@ -438,7 +455,7 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
//@todo: just reserve it, without copy of original contact (unless we use warmstarting)
btOpenCLArray<btContact4>* contactNative = contactsIn;
const btOpenCLArray<btRigidBodyCL>* bodyNative = bodyBuf;
@@ -477,7 +494,7 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
m_data->m_solverGPU->m_sortDataBuffer->resize(nContacts);
btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL()), btBufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), btBufferInfoCL( bodyBuf->getBufferCL()), btBufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
btLauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_setSortDataKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata.m_nContacts );
@@ -536,7 +553,7 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
btInt4 cdata;
cdata.x = nContacts;
btBufferInfoCL bInfo[] = { btBufferInfoCL( contactNative->getBufferCL() ), btBufferInfoCL( m_data->m_solverGPU->m_contactBuffer->getBufferCL())
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ), btBufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL())
, btBufferInfoCL( m_data->m_solverGPU->m_sortDataBuffer->getBufferCL()) };
btLauncherCL launcher(m_data->m_queue,m_data->m_solverGPU->m_reorderContactKernel);
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
@@ -554,19 +571,18 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
clFinish(m_data->m_queue);
if (nContacts)
{
BT_PROFILE("gpu m_copyConstraintKernel");
btInt4 cdata; cdata.x = nContacts;
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_data->m_solverGPU->m_contactBuffer->getBufferCL() ), btBufferInfoCL( contactNative->getBufferCL() ) };
btLauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 );
clFinish(m_data->m_queue);
}
{
BT_PROFILE("gpu m_copyConstraintKernel");
btInt4 cdata; cdata.x = nContacts;
btBufferInfoCL bInfo[] = { btBufferInfoCL( m_data->m_solverGPU->m_contactBuffer2->getBufferCL() ), btBufferInfoCL( m_data->m_pBufContactOutGPU->getBufferCL() ) };
btLauncherCL launcher(m_data->m_queue, m_data->m_solverGPU->m_copyConstraintKernel );
launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(btBufferInfoCL) );
launcher.setConst( cdata );
launcher.launch1D( nContacts, 64 );
clFinish(m_data->m_queue);
}
bool compareGPU = false;
@@ -575,13 +591,13 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
if (gpuBatchContacts)
{
BT_PROFILE("gpu batchContacts");
maxNumBatches = 50;
m_data->m_solverGPU->batchContacts( (btOpenCLArray<btContact4>*)contactNative, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
maxNumBatches = 25;//250;
m_data->m_solverGPU->batchContacts( m_data->m_pBufContactOutGPU, nContacts, m_data->m_solverGPU->m_numConstraints, m_data->m_solverGPU->m_offsets, csCfg.m_staticIdx );
} else
{
BT_PROFILE("cpu batchContacts");
btAlignedObjectArray<btContact4> cpuContacts;
btOpenCLArray<btContact4>* contactsIn = m_data->m_pBufContactOutGPU;
btOpenCLArray<btContact4>* contactsIn = m_data->m_solverGPU->m_contactBuffer2;
contactsIn->copyToHost(cpuContacts);
btOpenCLArray<unsigned int>* countsNative = m_data->m_solverGPU->m_numConstraints;
@@ -611,8 +627,11 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
numNonzeroGrid++;
//printf("cpu batch\n");
int simdWidth = 32;
int numBatches = sortConstraintByBatch( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
int simdWidth =64;//-1;//32;
int numBatches = sortConstraintByBatch3( &cpuContacts[0]+offset, n, simdWidth,csCfg.m_staticIdx ,numBodies); // on GPU
maxNumBatches = btMax(numBatches,maxNumBatches);
clFinish(m_data->m_queue);
@@ -622,7 +641,7 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
}
{
BT_PROFILE("m_contactBuffer->copyFromHost");
m_data->m_solverGPU->m_contactBuffer->copyFromHost((btAlignedObjectArray<btContact4>&)cpuContacts);
m_data->m_solverGPU->m_contactBuffer2->copyFromHost((btAlignedObjectArray<btContact4>&)cpuContacts);
}
}
@@ -636,7 +655,7 @@ void btGpuBatchingPgsSolver::solveContacts(int numBodies, cl_mem bodyBuf, cl_mem
{
//BT_PROFILE("gpu convertToConstraints");
m_data->m_solverGPU->convertToConstraints( bodyBuf,
shapeBuf, m_data->m_solverGPU->m_contactBuffer /*contactNative*/,
shapeBuf, m_data->m_solverGPU->m_contactBuffer2,
contactConstraintOut,
additionalData, nContacts,
(SolverBase::ConstraintCfg&) csCfg );
@@ -700,6 +719,13 @@ static bool sortfnc(const btSortData& a,const btSortData& b)
return (a.m_key<b.m_key);
}
btAlignedObjectArray<int> bodyUsed;
btAlignedObjectArray<unsigned int> idxBuffer;
btAlignedObjectArray<btSortData> sortData;
btAlignedObjectArray<btContact4> old;
@@ -830,3 +856,301 @@ inline int btGpuBatchingPgsSolver::sortConstraintByBatch( btContact4* cs, int n,
#endif
return batchIdx;
}
inline int btGpuBatchingPgsSolver::sortConstraintByBatch2( btContact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies)
{
BT_PROFILE("sortConstraintByBatch");
bodyUsed.resize(2*simdWidth);
for (int q=0;q<2*simdWidth;q++)
bodyUsed[q]=0;
int curBodyUsed = 0;
int numIter = 0;
m_data->m_sortData.resize(numConstraints);
m_data->m_idxBuffer.resize(numConstraints);
m_data->m_old.resize(numConstraints);
unsigned int* idxSrc = &m_data->m_idxBuffer[0];
#if defined(_DEBUG)
for(int i=0; i<numConstraints; i++)
cs[i].getBatchIdx() = -1;
#endif
for(int i=0; i<numConstraints; i++)
idxSrc[i] = i;
int numValidConstraints = 0;
int unprocessedConstraintIndex = 0;
int batchIdx = 0;
{
BT_PROFILE("cpu batch innerloop");
while( numValidConstraints < numConstraints)
{
numIter++;
int nCurrentBatch = 0;
// clear flag
for(int i=0; i<curBodyUsed; i++)
bodyUsed[i] = 0;
curBodyUsed = 0;
for(int i=numValidConstraints; i<numConstraints; i++)
{
int idx = idxSrc[i];
btAssert( idx < numConstraints );
// check if it can go
int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
int bodyA = abs(bodyAS);
int bodyB = abs(bodyBS);
bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;
bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
int aUnavailable = 0;
int bUnavailable = 0;
if (!aIsStatic)
{
for (int j=0;j<curBodyUsed;j++)
{
if (bodyA == bodyUsed[j])
{
aUnavailable=1;
break;
}
}
}
if (!aUnavailable)
if (!bIsStatic)
{
for (int j=0;j<curBodyUsed;j++)
{
if (bodyB == bodyUsed[j])
{
bUnavailable=1;
break;
}
}
}
if( aUnavailable==0 && bUnavailable==0 ) // ok
{
if (!aIsStatic)
{
bodyUsed[curBodyUsed++] = bodyA;
}
if (!bIsStatic)
{
bodyUsed[curBodyUsed++] = bodyB;
}
cs[idx].getBatchIdx() = batchIdx;
m_data->m_sortData[idx].m_key = batchIdx;
m_data->m_sortData[idx].m_value = idx;
if (i!=numValidConstraints)
{
btSwap(idxSrc[i], idxSrc[numValidConstraints]);
}
numValidConstraints++;
{
nCurrentBatch++;
if( nCurrentBatch == simdWidth )
{
nCurrentBatch = 0;
for(int i=0; i<curBodyUsed; i++)
bodyUsed[i] = 0;
curBodyUsed = 0;
}
}
}
}
batchIdx ++;
}
}
{
BT_PROFILE("quickSort");
//m_data->m_sortData.quickSort(sortfnc);
}
{
BT_PROFILE("reorder");
// reorder
memcpy( &m_data->m_old[0], cs, sizeof(btContact4)*numConstraints);
for(int i=0; i<numConstraints; i++)
{
btAssert(m_data->m_sortData[idxSrc[i]].m_value == idxSrc[i]);
int idx = m_data->m_sortData[idxSrc[i]].m_value;
cs[i] = m_data->m_old[idx];
}
}
#if defined(_DEBUG)
// debugPrintf( "nBatches: %d\n", batchIdx );
for(int i=0; i<numConstraints; i++)
{
btAssert( cs[i].getBatchIdx() != -1 );
}
#endif
return batchIdx;
}
inline int btGpuBatchingPgsSolver::sortConstraintByBatch3( btContact4* cs, int numConstraints, int simdWidth , int staticIdx, int numBodies)
{
BT_PROFILE("sortConstraintByBatch");
static int maxSwaps = 0;
int numSwaps = 0;
static int maxNumConstraints = 0;
if (maxNumConstraints<numConstraints)
{
maxNumConstraints = numConstraints;
printf("maxNumConstraints = %d\n",maxNumConstraints );
}
bodyUsed.resize(2*simdWidth);
for (int q=0;q<2*simdWidth;q++)
bodyUsed[q]=0;
int curBodyUsed = 0;
int numIter = 0;
m_data->m_sortData.resize(0);
m_data->m_idxBuffer.resize(0);
m_data->m_old.resize(0);
#if defined(_DEBUG)
for(int i=0; i<numConstraints; i++)
cs[i].getBatchIdx() = -1;
#endif
int numValidConstraints = 0;
int unprocessedConstraintIndex = 0;
int batchIdx = 0;
{
BT_PROFILE("cpu batch innerloop");
while( numValidConstraints < numConstraints)
{
numIter++;
int nCurrentBatch = 0;
// clear flag
for(int i=0; i<curBodyUsed; i++)
bodyUsed[i] = 0;
curBodyUsed = 0;
for(int i=numValidConstraints; i<numConstraints; i++)
{
int idx = i;
btAssert( idx < numConstraints );
// check if it can go
int bodyAS = cs[idx].m_bodyAPtrAndSignBit;
int bodyBS = cs[idx].m_bodyBPtrAndSignBit;
int bodyA = abs(bodyAS);
int bodyB = abs(bodyBS);
bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;
bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
int aUnavailable = 0;
int bUnavailable = 0;
if (!aIsStatic)
{
for (int j=0;j<curBodyUsed;j++)
{
if (bodyA == bodyUsed[j])
{
aUnavailable=1;
break;
}
}
}
if (!aUnavailable)
if (!bIsStatic)
{
for (int j=0;j<curBodyUsed;j++)
{
if (bodyB == bodyUsed[j])
{
bUnavailable=1;
break;
}
}
}
if( aUnavailable==0 && bUnavailable==0 ) // ok
{
if (!aIsStatic)
{
bodyUsed[curBodyUsed++] = bodyA;
}
if (!bIsStatic)
{
bodyUsed[curBodyUsed++] = bodyB;
}
cs[idx].getBatchIdx() = batchIdx;
if (i!=numValidConstraints)
{
btSwap(cs[i],cs[numValidConstraints]);
numSwaps++;
}
numValidConstraints++;
{
nCurrentBatch++;
if( nCurrentBatch == simdWidth )
{
nCurrentBatch = 0;
for(int i=0; i<curBodyUsed; i++)
bodyUsed[i] = 0;
curBodyUsed = 0;
}
}
}
}
batchIdx ++;
}
}
#if defined(_DEBUG)
// debugPrintf( "nBatches: %d\n", batchIdx );
for(int i=0; i<numConstraints; i++)
{
btAssert( cs[i].getBatchIdx() != -1 );
}
#endif
if (maxSwaps<numSwaps)
{
maxSwaps = numSwaps;
printf("maxSwaps = %d\n", maxSwaps);
}
return batchIdx;
}
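//For reference (not part of this commit): the greedy idea behind sortConstraintByBatch2/3
//above, and the CreateBatchesNew kernel elsewhere in this diff, is simply: walk the
//unassigned contacts, accept a contact into the current batch only if neither of its two
//bodies is already used by that batch, and start a new batch once simdWidth contacts have
//been accepted. A compact toy illustration on plain ints (hypothetical types, not the
//Bullet structs):
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

struct ToyContact { int bodyA; int bodyB; int batch; };   // toy stand-in for btContact4

// Within one batch no body appears twice, so the contacts of a batch
// can be solved in parallel without write conflicts.
static int batchContactsGreedy(std::vector<ToyContact>& cs, int simdWidth)
{
    int batchIdx = 0;
    size_t numValid = 0;
    while (numValid < cs.size())
    {
        std::set<int> used;              // bodies already touched by the current batch
        int inBatch = 0;
        for (size_t i = numValid; i < cs.size(); i++)
        {
            if (used.count(cs[i].bodyA) || used.count(cs[i].bodyB))
                continue;                // conflict: defer this contact to a later batch
            used.insert(cs[i].bodyA);
            used.insert(cs[i].bodyB);
            cs[i].batch = batchIdx;
            std::swap(cs[i], cs[numValid++]);   // compact accepted contacts to the front
            if (++inBatch == simdWidth) { used.clear(); inBatch = 0; }
        }
        batchIdx++;
    }
    return batchIdx;
}

int main()
{
    std::vector<ToyContact> cs = { {0,1,-1}, {1,2,-1}, {2,3,-1}, {0,3,-1} };
    printf("batches: %d\n", batchContactsGreedy(cs, 32));   // prints 2: {0-1,2-3} then {1-2,0-3}
    return 0;
}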

View File

@@ -12,10 +12,18 @@ class btGpuBatchingPgsSolver
{
protected:
struct btGpuBatchingPgsSolverInternalData* m_data;
void batchContacts( btOpenCLArray<btContact4>* contacts, int nContacts, btOpenCLArray<unsigned int>* n, btOpenCLArray<unsigned int>* offsets, int staticIdx );
inline int sortConstraintByBatch( btContact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch2( btContact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
inline int sortConstraintByBatch3( btContact4* cs, int n, int simdWidth , int staticIdx, int numBodies);
void solveContactConstraint( const btOpenCLArray<btRigidBodyCL>* bodyBuf, const btOpenCLArray<btInertiaCL>* shapeBuf,
btOpenCLArray<btGpuConstraint4>* constraint, void* additionalData, int n ,int maxNumBatches, int numIterations);

View File

@@ -63,10 +63,15 @@ void btGpuRigidBodyPipeline::stepSimulation(float deltaTime)
{
//update worldspace AABBs from local AABB/worldtransform
setupGpuAabbsFull();
{
setupGpuAabbsFull();
}
//compute overlapping pairs
m_data->m_broadphaseSap->calculateOverlappingPairs();
{
//m_data->m_broadphaseSap->calculateOverlappingPairsHost();
m_data->m_broadphaseSap->calculateOverlappingPairs();
}
//compute contact points

View File

@@ -0,0 +1,236 @@
/*
Copyright (c) 2012 Advanced Micro Devices, Inc.
This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it freely,
subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
//Originally written by Erwin Coumans
#pragma OPENCL EXTENSION cl_amd_printf : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#ifdef cl_ext_atomic_counters_32
#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable
#else
#define counter32_t volatile __global int*
#endif
#define SIMD_WIDTH 64
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;
#define GET_GROUP_IDX get_group_id(0)
#define GET_LOCAL_IDX get_local_id(0)
#define GET_GLOBAL_IDX get_global_id(0)
#define GET_GROUP_SIZE get_local_size(0)
#define GET_NUM_GROUPS get_num_groups(0)
#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
#define AtomInc(x) atom_inc(&(x))
#define AtomInc1(x, out) out = atom_inc(&(x))
#define AppendInc(x, out) out = atomic_inc(x)
#define AtomAdd(x, value) atom_add(&(x), value)
#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )
#define AtomXhg(x, value) atom_xchg ( &(x), value )
#define SELECT_UINT4( b, a, condition ) select( b,a,condition )
#define make_float4 (float4)
#define make_float2 (float2)
#define make_uint4 (uint4)
#define make_int4 (int4)
#define make_uint2 (uint2)
#define make_int2 (int2)
#define max2 max
#define min2 min
#define WG_SIZE 64
typedef struct
{
float4 m_worldPos[4];
float4 m_worldNormal;
u32 m_coeffs;
int m_batchIdx;
int m_bodyAPtrAndSignBit;//sign bit set for fixed objects
int m_bodyBPtrAndSignBit;
}Contact4;
typedef struct
{
int m_n;
int m_start;
int m_staticIdx;
int m_paddings[1];
} ConstBuffer;
typedef struct
{
int m_a;
int m_b;
u32 m_idx;
}Elem;
// batching on the GPU
__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )
{
int wgIdx = GET_GROUP_IDX;
int lIdx = GET_LOCAL_IDX;
const int m_n = gN[wgIdx];
const int m_start = gStart[wgIdx];
if( lIdx == 0 )
{
for (int i=0;i<m_n;i++)
{
int srcIdx = i+m_start;
int batchIndex = i;
gConstraints[ srcIdx ].m_batchIdx = batchIndex;
}
}
}
#define CHECK_SIZE (WG_SIZE)
u32 readBuf(__local u32* buff, int idx)
{
idx = idx % (32*CHECK_SIZE);
int bitIdx = idx%32;
int bufIdx = idx/32;
return buff[bufIdx] & (1<<bitIdx);
}
void writeBuf(__local u32* buff, int idx)
{
idx = idx % (32*CHECK_SIZE);
int bitIdx = idx%32;
int bufIdx = idx/32;
buff[bufIdx] |= (1<<bitIdx);
//atom_or( &buff[bufIdx], (1<<bitIdx) );
}
u32 tryWrite(__local u32* buff, int idx)
{
idx = idx % (32*CHECK_SIZE);
int bitIdx = idx%32;
int bufIdx = idx/32;
u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );
return ((ans >> bitIdx)&1) == 0;
}
// batching on the GPU
__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )
{
int wgIdx = GET_GROUP_IDX;
int lIdx = GET_LOCAL_IDX;
const int numConstraints = gN[wgIdx];
const int m_start = gStart[wgIdx];
__local u32 ldsFixedBuffer[CHECK_SIZE];
if( lIdx == 0 )
{
__global Contact4* cs = &gConstraints[m_start];
int numValidConstraints = 0;
int batchIdx = 0;
while( numValidConstraints < numConstraints)
{
int nCurrentBatch = 0;
// clear flag
for(int i=0; i<CHECK_SIZE; i++)
ldsFixedBuffer[i] = 0;
for(int i=numValidConstraints; i<numConstraints; i++)
{
int bodyAS = cs[i].m_bodyAPtrAndSignBit;
int bodyBS = cs[i].m_bodyBPtrAndSignBit;
int bodyA = abs(bodyAS);
int bodyB = abs(bodyBS);
bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;
bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;
int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);
int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);
if( aUnavailable==0 && bUnavailable==0 ) // ok
{
if (!aIsStatic)
{
writeBuf( ldsFixedBuffer, bodyA );
}
if (!bIsStatic)
{
writeBuf( ldsFixedBuffer, bodyB );
}
cs[i].m_batchIdx = batchIdx;
if (i!=numValidConstraints)
{
//btSwap(cs[i],cs[numValidConstraints]);
Contact4 tmp = cs[i];
cs[i] = cs[numValidConstraints];
cs[numValidConstraints] = tmp;
}
numValidConstraints++;
nCurrentBatch++;
if( nCurrentBatch == SIMD_WIDTH)
{
nCurrentBatch = 0;
for(int i=0; i<CHECK_SIZE; i++)
ldsFixedBuffer[i] = 0;
}
}
}//for
batchIdx ++;
}//while
}//if( lIdx == 0 )
//return batchIdx;
}
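//Note (not from this commit): ldsFixedBuffer above acts as a (32*CHECK_SIZE)-bit occupancy
//bitset in local memory. readBuf/writeBuf index it by body id modulo 32*CHECK_SIZE, so two
//different bodies can alias the same bit; that only makes the batching more conservative
//(extra deferrals), never incorrect. The same indexing as a host-side C++ sketch:
#include <cstdio>

#define WG_SIZE 64
#define CHECK_SIZE (WG_SIZE)

// bit (idx % (32*CHECK_SIZE)) marks "this body is already used in the current batch"
static unsigned int readBit(const unsigned int* buf, int idx)
{
    idx %= 32 * CHECK_SIZE;
    return buf[idx / 32] & (1u << (idx % 32));
}

static void writeBit(unsigned int* buf, int idx)
{
    idx %= 32 * CHECK_SIZE;
    buf[idx / 32] |= (1u << (idx % 32));
}

int main()
{
    unsigned int buf[CHECK_SIZE] = {0};
    writeBit(buf, 5);
    printf("%d %d %d\n",
           readBit(buf, 5) != 0,                     // 1: body 5 is marked
           readBit(buf, 6) != 0,                     // 0: body 6 is free
           readBit(buf, 5 + 32 * CHECK_SIZE) != 0);  // 1: aliases body 5 (same bit)
    return 0;
}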

View File

@@ -0,0 +1,240 @@
//this file is autogenerated using stringify.bat (premake --stringify) in the build folder of this project
static const char* batchingKernelsNewCL= \
"/*\n"
"Copyright (c) 2012 Advanced Micro Devices, Inc. \n"
"\n"
"This software is provided 'as-is', without any express or implied warranty.\n"
"In no event will the authors be held liable for any damages arising from the use of this software.\n"
"Permission is granted to anyone to use this software for any purpose, \n"
"including commercial applications, and to alter it and redistribute it freely, \n"
"subject to the following restrictions:\n"
"\n"
"1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.\n"
"2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.\n"
"3. This notice may not be removed or altered from any source distribution.\n"
"*/\n"
"//Originally written by Erwin Coumans\n"
"\n"
"\n"
"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable\n"
"#pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable\n"
"\n"
"#ifdef cl_ext_atomic_counters_32\n"
"#pragma OPENCL EXTENSION cl_ext_atomic_counters_32 : enable\n"
"#else\n"
"#define counter32_t volatile __global int*\n"
"#endif\n"
"\n"
"#define SIMD_WIDTH 64\n"
"\n"
"typedef unsigned int u32;\n"
"typedef unsigned short u16;\n"
"typedef unsigned char u8;\n"
"\n"
"#define GET_GROUP_IDX get_group_id(0)\n"
"#define GET_LOCAL_IDX get_local_id(0)\n"
"#define GET_GLOBAL_IDX get_global_id(0)\n"
"#define GET_GROUP_SIZE get_local_size(0)\n"
"#define GET_NUM_GROUPS get_num_groups(0)\n"
"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
"#define AtomInc(x) atom_inc(&(x))\n"
"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
"#define AppendInc(x, out) out = atomic_inc(x)\n"
"#define AtomAdd(x, value) atom_add(&(x), value)\n"
"#define AtomCmpxhg(x, cmp, value) atom_cmpxchg( &(x), cmp, value )\n"
"#define AtomXhg(x, value) atom_xchg ( &(x), value )\n"
"\n"
"\n"
"#define SELECT_UINT4( b, a, condition ) select( b,a,condition )\n"
"\n"
"#define make_float4 (float4)\n"
"#define make_float2 (float2)\n"
"#define make_uint4 (uint4)\n"
"#define make_int4 (int4)\n"
"#define make_uint2 (uint2)\n"
"#define make_int2 (int2)\n"
"\n"
"\n"
"#define max2 max\n"
"#define min2 min\n"
"\n"
"\n"
"#define WG_SIZE 64\n"
"\n"
"\n"
"\n"
"typedef struct \n"
"{\n"
" float4 m_worldPos[4];\n"
" float4 m_worldNormal;\n"
" u32 m_coeffs;\n"
" int m_batchIdx;\n"
"\n"
" int m_bodyAPtrAndSignBit;//sign bit set for fixed objects\n"
" int m_bodyBPtrAndSignBit;\n"
"}Contact4;\n"
"\n"
"typedef struct \n"
"{\n"
" int m_n;\n"
" int m_start;\n"
" int m_staticIdx;\n"
" int m_paddings[1];\n"
"} ConstBuffer;\n"
"\n"
"typedef struct \n"
"{\n"
" int m_a;\n"
" int m_b;\n"
" u32 m_idx;\n"
"}Elem;\n"
"\n"
"\n"
"\n"
"\n"
"\n"
"// batching on the GPU\n"
"__kernel void CreateBatchesBruteForce( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int m_staticIdx )\n"
"{\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" \n"
" const int m_n = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" for (int i=0;i<m_n;i++)\n"
" {\n"
" int srcIdx = i+m_start;\n"
" int batchIndex = i;\n"
" gConstraints[ srcIdx ].m_batchIdx = batchIndex; \n"
" }\n"
" }\n"
"}\n"
"\n"
"\n"
"#define CHECK_SIZE (WG_SIZE)\n"
"\n"
"\n"
"\n"
"\n"
"u32 readBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" return buff[bufIdx] & (1<<bitIdx);\n"
"}\n"
"\n"
"void writeBuf(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" buff[bufIdx] |= (1<<bitIdx);\n"
" //atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
"}\n"
"\n"
"u32 tryWrite(__local u32* buff, int idx)\n"
"{\n"
" idx = idx % (32*CHECK_SIZE);\n"
" int bitIdx = idx%32;\n"
" int bufIdx = idx/32;\n"
" u32 ans = (u32)atom_or( &buff[bufIdx], (1<<bitIdx) );\n"
" return ((ans >> bitIdx)&1) == 0;\n"
"}\n"
"\n"
"\n"
"// batching on the GPU\n"
"__kernel void CreateBatchesNew( __global Contact4* gConstraints, __global const u32* gN, __global const u32* gStart, int staticIdx )\n"
"{\n"
" int wgIdx = GET_GROUP_IDX;\n"
" int lIdx = GET_LOCAL_IDX;\n"
" const int numConstraints = gN[wgIdx];\n"
" const int m_start = gStart[wgIdx];\n"
" \n"
" \n"
" __local u32 ldsFixedBuffer[CHECK_SIZE];\n"
" \n"
" \n"
" \n"
" \n"
" \n"
" if( lIdx == 0 )\n"
" {\n"
" \n"
" \n"
" __global Contact4* cs = &gConstraints[m_start]; \n"
" \n"
" \n"
" int numValidConstraints = 0;\n"
" int batchIdx = 0;\n"
"\n"
" while( numValidConstraints < numConstraints)\n"
" {\n"
" int nCurrentBatch = 0;\n"
" // clear flag\n"
" \n"
" for(int i=0; i<CHECK_SIZE; i++) \n"
" ldsFixedBuffer[i] = 0; \n"
"\n"
" for(int i=numValidConstraints; i<numConstraints; i++)\n"
" {\n"
"\n"
" int bodyAS = cs[i].m_bodyAPtrAndSignBit;\n"
" int bodyBS = cs[i].m_bodyBPtrAndSignBit;\n"
" int bodyA = abs(bodyAS);\n"
" int bodyB = abs(bodyBS);\n"
" bool aIsStatic = (bodyAS<0) || bodyAS==staticIdx;\n"
" bool bIsStatic = (bodyBS<0) || bodyBS==staticIdx;\n"
" int aUnavailable = aIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyA);\n"
" int bUnavailable = bIsStatic ? 0 : readBuf( ldsFixedBuffer, bodyB);\n"
" \n"
" if( aUnavailable==0 && bUnavailable==0 ) // ok\n"
" {\n"
" if (!aIsStatic)\n"
" {\n"
" writeBuf( ldsFixedBuffer, bodyA );\n"
" }\n"
" if (!bIsStatic)\n"
" {\n"
" writeBuf( ldsFixedBuffer, bodyB );\n"
" }\n"
"\n"
" cs[i].m_batchIdx = batchIdx;\n"
"\n"
" if (i!=numValidConstraints)\n"
" {\n"
" //btSwap(cs[i],cs[numValidConstraints]);\n"
" \n"
" Contact4 tmp = cs[i];\n"
" cs[i] = cs[numValidConstraints];\n"
" cs[numValidConstraints] = tmp;\n"
" \n"
" }\n"
"\n"
" numValidConstraints++;\n"
" \n"
" nCurrentBatch++;\n"
" if( nCurrentBatch == SIMD_WIDTH)\n"
" {\n"
" nCurrentBatch = 0;\n"
" for(int i=0; i<CHECK_SIZE; i++) \n"
" ldsFixedBuffer[i] = 0;\n"
" \n"
" }\n"
" }\n"
" }//for\n"
" batchIdx ++;\n"
" }//while\n"
" }//if( lIdx == 0 )\n"
" \n"
" //return batchIdx;\n"
"}\n"
"\n"
;