Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
+++ b/Extras/RigidBodyGpuPipeline/dynamics/basic_demo/Stubs/Solver.inl
@@ -0,0 +1,762 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#define PATH "..\\..\\dynamics\\basic_demo\\Stubs\\SolverKernels" // first argument of device->getKernel() for all solver kernels; no file extension is given here (presumably appended by the kernel loader - confirm)
+#define BATCHING_PATH "..\\..\\dynamics\\basic_demo\\Stubs\\batchingKernels" // first argument of device->getKernel() for the "CreateBatches" kernel
+
+#define KERNEL1 "SingleBatchSolveKernel" // not referenced by the code visible in this file
+#define KERNEL2 "BatchSolveKernel" // kernel name bound to Data::m_batchSolveKernel in allocate()
+
+#define KERNEL3 "ContactToConstraintKernel" // kernel name bound to Data::m_contactToConstraintKernel
+#define KERNEL4 "SetSortDataKernel" // kernel name bound to Data::m_setSortDataKernel
+#define KERNEL5 "ReorderContactKernel" // kernel name bound to Data::m_reorderContactKernel
+#include "SolverKernels.h"
+
+#include "batchingKernels.h"
+
+
+struct SolverDebugInfo // Debug record (16 ints + 4 floats); an array with one element per work item is allocated only when DEBUG_ME / BATCH_DEBUG is defined. NOTE(review): presumably must match the layout the kernels write - confirm against the .cl sources before reordering fields
+{
+	int m_valInt0;
+	int m_valInt1;
+	int m_valInt2;
+	int m_valInt3;
+	
+	int m_valInt4;
+	int m_valInt5;
+	int m_valInt6;
+	int m_valInt7;
+
+	int m_valInt8;
+	int m_valInt9;
+	int m_valInt10;
+	int m_valInt11;
+
+	int	m_valInt12;
+	int	m_valInt13;
+	int	m_valInt14;
+	int	m_valInt15;
+
+	// float slots; not read by any host code visible in this file
+	float m_val0;
+	float m_val1;
+	float m_val2;
+	float m_val3;
+};
+
+
+
+
+class SolverDeviceInl // Holder for helper types used only by the device solver implementation in this file
+{
+public:
+	struct ParallelSolveData // Stored behind Solver::Data::m_parallelSolveData and cast back where needed; both buffers are created with N_SPLIT*N_SPLIT entries in allocate() and deleted in deallocate()
+	{
+		Buffer<u32>* m_numConstraints; // written by BoundSearch (COUNT mode) in sortContacts - presumably the number of contacts per cell
+		Buffer<u32>* m_offsets; // written by PrefixScan over m_numConstraints in sortContacts
+	};
+};
+
+//	Creates the solver state for one device: fetches every kernel used by this file and
+//	allocates the sort / search / scan helpers plus the scratch buffers.
+//
+//	device        device that owns all kernels and buffers created here
+//	pairCapacity  contact capacity; rounded up to a multiple of 512 to size the sort buffers.
+//	              At or above DYNAMIC_CONTACT_ALLOCATION_THRESHOLD the contact scratch buffer
+//	              is left null here and created later, on first use.
+//	returns       a heap-allocated Data; hand it back to Solver<TYPE>::deallocate()
+template<DeviceType TYPE>
+typename Solver<TYPE>::Data* Solver<TYPE>::allocate( const Device* device, int pairCapacity )
+{
+	//	Kernel sources, indexed by TYPE below. Entries are null unless the kernels are embedded as strings.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* solverSource[] = { solverKernelsCL, 0 };
+	const char* batchingSource[] = { batchingKernelsCL, 0 };
+#else
+	const char* solverSource[] = { 0, 0 };
+	const char* batchingSource[] = { 0, 0 };
+#endif
+
+	const bool useKernelCache = true;
+	const int nCells = N_SPLIT*N_SPLIT;
+
+	Data* solver = new Data;
+	solver->m_device = device;
+
+	//	Kernels (same request order as before; only the first two pass the cache flag explicitly)
+	solver->m_batchingKernel = device->getKernel( BATCHING_PATH, "CreateBatches", "-I ..\\..\\ ", batchingSource[TYPE], useKernelCache );
+	solver->m_batchSolveKernel = device->getKernel( PATH, KERNEL2, "-I ..\\..\\ ", solverSource[TYPE], useKernelCache );
+	solver->m_contactToConstraintKernel = device->getKernel( PATH, KERNEL3, "-I ..\\..\\ ", solverSource[TYPE] );
+	solver->m_setSortDataKernel = device->getKernel( PATH, KERNEL4, "-I ..\\..\\ ", solverSource[TYPE] );
+	solver->m_reorderContactKernel = device->getKernel( PATH, KERNEL5, "-I ..\\..\\ ", solverSource[TYPE] );
+	solver->m_copyConstraintKernel = device->getKernel( PATH, "CopyConstraintKernel", "-I ..\\..\\ ", solverSource[TYPE] );
+
+	//	Per-cell count / offset buffers used by the parallel solve
+	SolverDeviceInl::ParallelSolveData* parallel = new SolverDeviceInl::ParallelSolveData;
+	solver->m_parallelSolveData = parallel;
+	parallel->m_numConstraints = new Buffer<u32>( device, nCells );
+	parallel->m_offsets = new Buffer<u32>( device, nCells );
+
+	//	Sort resources are sized to the capacity rounded up to a multiple of 512 (todo. remove hardcode this)
+	const int paddedCapacity = NEXTMULTIPLEOF( pairCapacity, 512 );
+	solver->m_sort32 = RadixSort32<TYPE>::allocate( solver->m_device, paddedCapacity );
+	solver->m_search = BoundSearch<TYPE>::allocate( solver->m_device, nCells );
+	solver->m_scan = PrefixScan<TYPE>::allocate( solver->m_device, nCells );
+	solver->m_sortDataBuffer = new Buffer<SortData>( solver->m_device, paddedCapacity );
+
+	//	Small capacities get their contact scratch buffer up front; large ones allocate it lazily
+	solver->m_contactBuffer = ( pairCapacity < DYNAMIC_CONTACT_ALLOCATION_THRESHOLD )
+		? new Buffer<Contact4>( solver->m_device, pairCapacity )
+		: 0;
+
+	return solver;
+}
+
+//	Releases everything created by Solver<TYPE>::allocate(), then the Data object itself.
+//	data must not be used after this call.
+template<DeviceType TYPE>
+void Solver<TYPE>::deallocate( Data* data )
+{
+	//	Parallel-solve buffers first, then their holder
+	SolverDeviceInl::ParallelSolveData* parallel = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+	delete parallel->m_numConstraints;
+	delete parallel->m_offsets;
+	delete parallel;
+
+	RadixSort32<TYPE>::deallocate( data->m_sort32 );
+	BoundSearch<TYPE>::deallocate( data->m_search );
+	PrefixScan<TYPE>::deallocate( data->m_scan );
+
+	delete data->m_sortDataBuffer;
+	//	m_contactBuffer may still be null (lazy allocation); deleting a null pointer is a no-op
+	delete data->m_contactBuffer;
+
+	delete data;
+}
+
+//	Prepares contacts for solving and converts them to constraints.
+//
+//	When cfg.m_enableParallelSolve is set the contacts are first sorted (sortContacts),
+//	copied back from the scratch buffer with m_copyConstraintKernel and batched
+//	(batchContacts). In every case convertToConstraints() is then run on the result.
+//
+//	data            solver state from allocate(); m_contactBuffer is (re)allocated here when
+//	                it is missing or holds fewer than nContacts elements
+//	bodyBuf         rigid body buffer, forwarded to the sub-steps
+//	shapeBuf        inertia buffer, forwarded to convertToConstraints()
+//	contactsIn      contacts; mapped to the device on entry and copied back on exit
+//	contactCOut     destination constraint buffer, forwarded to convertToConstraints()
+//	additionalData  forwarded untouched
+//	nContacts       number of valid contacts in contactsIn
+//	cfg             solver configuration
+template<DeviceType TYPE>
+void Solver<TYPE>::reorderConvertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf,
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	//	Drop a scratch buffer that is too small for this call ...
+	if( data->m_contactBuffer && data->m_contactBuffer->getSize() < nContacts )
+	{
+		BT_PROFILE("delete data->m_contactBuffer;");
+		delete data->m_contactBuffer;
+		data->m_contactBuffer = 0;
+	}
+	//	... and (re)create it at exactly nContacts elements
+	if( data->m_contactBuffer == 0 )
+	{
+		BT_PROFILE("new data->m_contactBuffer;");
+
+		data->m_contactBuffer = new Buffer<Contact4>( data->m_device, nContacts );
+	}
+	Stopwatch sw;
+
+	Buffer<Contact4>* contactNative = BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn, nContacts );
+
+	//DeviceUtils::Config dhCfg;
+	//Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		DeviceUtils::waitForCompletion( data->m_device );
+		sw.start();
+		//	contactsIn -> data->m_contactBuffer
+		{
+			BT_PROFILE("sortContacts");
+			Solver<TYPE>::sortContacts( data, bodyBuf, contactNative, additionalData, nContacts, cfg );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		sw.split();
+		//	data->m_contactBuffer -> contactNative
+		if(0)
+		{
+			//	debugging alternative: round-trip through host memory instead of the copy kernel
+			Contact4* tmp = new Contact4[nContacts];
+			data->m_contactBuffer->read( tmp, nContacts );
+			DeviceUtils::waitForCompletion( data->m_contactBuffer->m_device );
+			contactNative->write( tmp, nContacts );
+			DeviceUtils::waitForCompletion( contactNative->m_device );
+			delete [] tmp;
+		}
+		else
+		{
+			BT_PROFILE("m_copyConstraintKernel");
+
+			Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+			//	Initialise all four components: the whole int4 is handed to the kernel,
+			//	so y/z/w must not be left indeterminate (same pattern as solveContactConstraint)
+			int4 cdata = make_int4( nContacts, 0, 0, 0 );
+			BufferInfo bInfo[] = { BufferInfo( data->m_contactBuffer ), BufferInfo( contactNative ) };
+//			Launcher launcher( data->m_device, data->m_device->getKernel( PATH, "CopyConstraintKernel",  "-I ..\\..\\ -Wf,--c++", 0 ) );
+			Launcher launcher( data->m_device, data->m_copyConstraintKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			launcher.launch1D( nContacts, 64 );
+			DeviceUtils::waitForCompletion( data->m_device );
+		}
+		{
+			BT_PROFILE("batchContacts");
+			Solver<TYPE>::batchContacts( data, contactNative, nContacts, nativeSolveData->m_numConstraints, nativeSolveData->m_offsets, cfg.m_staticIdx );
+
+		}
+	}
+	{
+			BT_PROFILE("waitForCompletion (batchContacts)");
+			DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.split();
+	//================
+	//	A host-side conversion used to be selectable here; it is kept for reference only:
+//		Solver<TYPE_HOST>::Data* solverHost = Solver<TYPE_HOST>::allocate( deviceHost, nContacts );
+//		Solver<TYPE_HOST>::convertToConstraints( solverHost, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+//		Solver<TYPE_HOST>::deallocate( solverHost );
+	{
+		BT_PROFILE("convertToConstraints");
+		Solver<TYPE>::convertToConstraints( data, bodyBuf, shapeBuf, contactNative, contactCOut, additionalData, nContacts, cfg );
+	}
+	{
+		BT_PROFILE("convertToConstraints waitForCompletion");
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+	sw.stop();
+
+	{
+		BT_PROFILE("printf");
+
+		//	timings are fetched but only printed when the line below is re-enabled
+		float t[5];
+		sw.getMs( t, 3 );
+//		printf("%3.2f, %3.2f, %3.2f, ", t[0], t[1], t[2]);
+	}
+
+	{
+		BT_PROFILE("deallocate and unmap");
+
+		//DeviceUtils::deallocate( deviceHost );
+
+		//	copies the (possibly reordered) contacts back to contactsIn
+		BufferUtils::unmap<true>( contactNative, contactsIn, nContacts );
+	}
+}
+
+
+//	Solves the contact constraints on the device.
+//
+//	Launches m_batchSolveKernel data->m_nIterations * N_BATCHES times with cdata.x = 0 and
+//	then the same number of times again with cdata.x = 1; for every launch cdata.z is the
+//	batch index and cdata.w is N_SPLIT. NOTE(review): cdata.x presumably selects the solve
+//	phase inside the kernel and cdata.y (250) is a kernel-side limit - confirm in SolverKernels.
+//
+//	data            solver state from allocate()
+//	bodyBuf         rigid bodies; mapped to the device and copied back afterwards
+//	shapeBuf        inertia data; mapped to the device, not copied back
+//	constraint      constraint buffer (cast to Buffer<Constraint4>*); copied back afterwards
+//	additionalData  only used by the disabled host path
+//	n               only used by the disabled host path; the initial cdata.x = n is
+//	                overwritten before the first launch
+template<DeviceType TYPE>
+void Solver<TYPE>::solveContactConstraint( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+			SolverData constraint, void* additionalData, int n )
+{
+	if(0)
+	{
+		//	disabled reference path: run the host solver instead
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::solveContactConstraint( hostData, bodyBuf, shapeBuf, constraint, additionalData, n );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	ADLASSERT( data );
+
+	Buffer<Constraint4>* cBuffer =0;
+	
+	Buffer<RigidBodyBase::Body>* gBodyNative=0; 
+	Buffer<RigidBodyBase::Inertia>* gShapeNative =0;
+	Buffer<Constraint4>* gConstraintNative =0;
+	
+
+	{
+		BT_PROFILE("map");
+		cBuffer = (Buffer<Constraint4>*)constraint;
+
+		gBodyNative= BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		gShapeNative= BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		gConstraintNative = BufferUtils::map<TYPE, true>( data->m_device, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+
+	//	NOTE(review): unlike the other launches in this file this const buffer is
+	//	default-constructed (no device / size) - confirm setConst does not need storage here
+	Buffer<int4> constBuffer;
+	int4 cdata = make_int4( n, 0, 0, 0 );
+	{
+		SolverDeviceInl::ParallelSolveData* solveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+		const int nn = N_SPLIT*N_SPLIT;
+
+		cdata.x = 0;
+		cdata.y = 250;
+
+#if 0
+//check how the cells are filled
+		unsigned int* hostCounts = new unsigned int[N_SPLIT*N_SPLIT];
+		solveData->m_numConstraints->read(hostCounts,N_SPLIT*N_SPLIT);
+		DeviceUtils::waitForCompletion( data->m_device );
+		for (int i=0;i<N_SPLIT*N_SPLIT;i++)
+		{
+			if (hostCounts[i])
+			{
+				printf("hostCounts[%d]=%d\n",i,hostCounts[i]);
+			}
+		}
+		delete[] hostCounts;
+#endif
+
+		//	64 work items per cell, with the cells divided over N_BATCHES launches
+		int numWorkItems = 64*nn/N_BATCHES;
+#ifdef DEBUG_ME
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+#endif
+
+
+
+		{
+
+			BT_PROFILE("m_batchSolveKernel iterations");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+#ifdef DEBUG_ME
+					//	clear the debug records before every launch
+					memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+					gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+				
+
+					BufferInfo bInfo[] = { 
+
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets ) 
+#ifdef DEBUG_ME
+						,	BufferInfo(&gpuDebugInfo)
+#endif
+						};
+
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					
+					launcher.launch1D( numWorkItems, 64 );
+
+#ifdef DEBUG_ME
+					DeviceUtils::waitForCompletion( data->m_device );
+					gpuDebugInfo.read(debugInfo,numWorkItems);
+					DeviceUtils::waitForCompletion( data->m_device );
+					for (int i=0;i<numWorkItems;i++)
+					{
+						//	the format needs one conversion per argument: index first, then value
+						if (debugInfo[i].m_valInt2>0)
+						{
+							printf("debugInfo[%d].m_valInt2 = %d\n",i,debugInfo[i].m_valInt2);
+						}
+
+						if (debugInfo[i].m_valInt3>0)
+						{
+							printf("debugInfo[%d].m_valInt3 = %d\n",i,debugInfo[i].m_valInt3);
+						}
+					}
+#endif //DEBUG_ME
+
+
+				}
+			}
+		
+			DeviceUtils::waitForCompletion( data->m_device );
+
+
+		}
+
+		cdata.x = 1;
+		{
+			BT_PROFILE("m_batchSolveKernel iterations2");
+			for(int iter=0; iter<data->m_nIterations; iter++)
+			{
+				for(int ib=0; ib<N_BATCHES; ib++)
+				{
+					cdata.z = ib;
+					cdata.w = N_SPLIT;
+
+					BufferInfo bInfo[] = { 
+						BufferInfo( gBodyNative ), 
+						BufferInfo( gShapeNative ), 
+						BufferInfo( gConstraintNative ),
+						BufferInfo( solveData->m_numConstraints ), 
+						BufferInfo( solveData->m_offsets )
+#ifdef DEBUG_ME
+						,BufferInfo(&gpuDebugInfo)
+#endif //DEBUG_ME
+					};
+					Launcher launcher( data->m_device, data->m_batchSolveKernel );
+					launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+					launcher.setConst( constBuffer, cdata );
+					//	same launch size as the first phase
+					launcher.launch1D( numWorkItems, 64 );
+				}
+			}
+			DeviceUtils::waitForCompletion( data->m_device );
+			
+		}
+#ifdef DEBUG_ME
+		delete[] debugInfo;
+#endif //DEBUG_ME
+	}
+
+	{
+		BT_PROFILE("unmap");
+		//	bodies and constraints are copied back; the inertia data is not
+		BufferUtils::unmap<true>( gBodyNative, bodyBuf );
+		BufferUtils::unmap<false>( gShapeNative, shapeBuf );
+		BufferUtils::unmap<true>( gConstraintNative, cBuffer );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+}
+
+//	Turns nContacts contacts into solver constraints by running m_contactToConstraintKernel.
+//
+//	bodyBuf / shapeBuf / contactsIn are mapped to the device as inputs and are not copied
+//	back; contactCOut (a Buffer<Constraint4>*) is mapped without an upload and copied back
+//	when the kernel has finished. Only valid on an OpenCL device (asserted below).
+//	additionalData is not used by this implementation.
+template<DeviceType TYPE>
+void Solver<TYPE>::convertToConstraints( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+	const Buffer<RigidBodyBase::Inertia>* shapeBuf, 
+	Buffer<Contact4>* contactsIn, SolverData contactCOut, void* additionalData, 
+	int nContacts, const ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	//	Constant block handed to the kernel
+	struct CB
+	{
+		int m_nContacts;
+		float m_dt;
+		float m_positionDrift;
+		float m_positionConstraintCoeff;
+	};
+
+	Buffer<Constraint4>* constraintsOut = (Buffer<Constraint4>*)contactCOut;
+
+	Buffer<RigidBodyBase::Body>* devBodies = 0;
+	Buffer<RigidBodyBase::Inertia>* devShapes = 0;
+	Buffer<Contact4>* devContacts = 0;
+	Buffer<Constraint4>* devConstraints = 0;
+
+	{
+		BT_PROFILE("map buffers");
+
+		devBodies = BufferUtils::map<TYPE, true>( data->m_device, bodyBuf );
+		devShapes = BufferUtils::map<TYPE, true>( data->m_device, shapeBuf );
+		devContacts = BufferUtils::map<TYPE, true>( data->m_device, contactsIn );
+		//	output only: no upload needed
+		devConstraints = BufferUtils::map<TYPE, false>( data->m_device, constraintsOut );
+	}
+
+	{
+		BT_PROFILE("m_contactToConstraintKernel");
+
+		CB params;
+		params.m_nContacts = nContacts;
+		params.m_dt = cfg.m_dt;
+		params.m_positionDrift = cfg.m_positionDrift;
+		params.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
+
+		Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		BufferInfo kernelArgs[] = {
+			BufferInfo( devContacts ),
+			BufferInfo( devBodies ),
+			BufferInfo( devShapes ),
+			BufferInfo( devConstraints )
+		};
+
+		Launcher launcher( data->m_device, data->m_contactToConstraintKernel );
+		launcher.setBuffers( kernelArgs, sizeof(kernelArgs)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, params );
+		launcher.launch1D( nContacts, 64 );
+		DeviceUtils::waitForCompletion( data->m_device );
+	}
+
+	{
+		BT_PROFILE("unmap");
+		//	only the constraints are copied back
+		BufferUtils::unmap<false>( devBodies, bodyBuf );
+		BufferUtils::unmap<false>( devShapes, shapeBuf );
+		BufferUtils::unmap<false>( devContacts, contactsIn );
+		BufferUtils::unmap<true>( devConstraints, constraintsOut );
+	}
+}
+
+//	Sorts the contacts by the key written by m_setSortDataKernel and stores the reordered
+//	contacts in data->m_contactBuffer. Does nothing except map/unmap when
+//	cfg.m_enableParallelSolve is false.
+//
+//	Steps (numbering kept from the original comments):
+//	  2. m_setSortDataKernel fills data->m_sortDataBuffer from the contacts and bodies
+//	  3. RadixSort32 sorts that buffer
+//	  4. BoundSearch (COUNT) + PrefixScan fill the per-cell counts and offsets
+//	  5. m_reorderContactKernel writes the contacts to data->m_contactBuffer in sorted order
+//
+//	data            solver state; m_contactBuffer must already hold at least nContacts elements
+//	bodyBuf         rigid bodies, read only
+//	contactsIn      contacts to sort, read only here
+//	additionalData  not used by this implementation
+//	nContacts       number of valid contacts
+//	cfg             provides m_enableParallelSolve, m_staticIdx and m_averageExtent
+template<DeviceType TYPE>
+void Solver<TYPE>::sortContacts( typename Solver<TYPE>::Data* data, const Buffer<RigidBodyBase::Body>* bodyBuf, 
+			Buffer<Contact4>* contactsIn, void* additionalData, 
+			int nContacts, const typename Solver<TYPE>::ConstraintCfg& cfg )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+	Buffer<RigidBodyBase::Body>* bodyNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, bodyBuf );
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contactsIn );
+
+	const int sortAlignment = 512; // todo. get this out of sort
+	if( cfg.m_enableParallelSolve )
+	{
+		SolverDeviceInl::ParallelSolveData* nativeSolveData = (SolverDeviceInl::ParallelSolveData*)data->m_parallelSolveData;
+
+		int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
+
+		//	The sort buffer is sized once in allocate() from pairCapacity and never regrown,
+		//	while the sort below touches sortSize elements of it
+		ADLASSERT( sortSize <= data->m_sortDataBuffer->getSize() );
+
+		Buffer<u32>* countsNative = nativeSolveData->m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
+		Buffer<u32>* offsetsNative = nativeSolveData->m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
+
+		{	//	2. set cell idx
+			struct CB
+			{
+				int m_nContacts;
+				int m_staticIdx;
+				float m_scale;
+				int m_nSplit;
+			};
+
+			ADLASSERT( sortSize%64 == 0 );
+			CB cdata;
+			cdata.m_nContacts = nContacts;
+			cdata.m_staticIdx = cfg.m_staticIdx;
+			cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
+			cdata.m_nSplit = N_SPLIT;
+
+			Buffer<CB> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+			BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( bodyNative ), BufferInfo( data->m_sortDataBuffer ) };
+			Launcher launcher( data->m_device, data->m_setSortDataKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( constBuffer, cdata );
+			//	launched over the padded size, not just nContacts
+			launcher.launch1D( sortSize, 64 );
+		}
+
+		{	//	3. sort by cell idx
+			//	(a reduced-bit sort for small N_SPLIT*N_SPLIT was sketched here but never wired in)
+			RadixSort32<TYPE>::execute( data->m_sort32, *data->m_sortDataBuffer,sortSize);
+		}
+		{	//	4. find entries
+			BoundSearch<TYPE>::execute( data->m_search, *data->m_sortDataBuffer, nContacts, *countsNative, N_SPLIT*N_SPLIT, BoundSearchBase::COUNT );
+
+			PrefixScan<TYPE>::execute( data->m_scan, *countsNative, *offsetsNative, N_SPLIT*N_SPLIT );
+		}
+
+		{	//	5. sort constraints by cellIdx
+			//	todo. preallocate this
+//			ADLASSERT( contactsIn->getType() == TYPE_HOST );
+//			Buffer<Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn );	//	copying contacts to this buffer
+
+			{
+				Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+
+				//	initialise all four components; the whole int4 is handed to the kernel
+				int4 cdata = make_int4( nContacts, 0, 0, 0 );
+				BufferInfo bInfo[] = { BufferInfo( contactNative ), BufferInfo( data->m_contactBuffer ), BufferInfo( data->m_sortDataBuffer ) };
+				Launcher launcher( data->m_device, data->m_reorderContactKernel );
+				launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+				launcher.setConst( constBuffer, cdata );
+				launcher.launch1D( nContacts, 64 );
+			}
+//			BufferUtils::unmap<true>( out, contactsIn, nContacts );
+		}
+	}
+
+	//	inputs only: nothing is copied back
+	BufferUtils::unmap<false>( bodyNative, bodyBuf );
+	BufferUtils::unmap<false>( contactNative, contactsIn );
+}
+
+//	Runs the "CreateBatches" kernel (m_batchingKernel) over the contacts and copies the
+//	result from data->m_contactBuffer back into `contacts`.
+//
+//	data       solver state; m_contactBuffer receives the kernel output
+//	contacts   contacts to batch; overwritten with the batched contacts
+//	nContacts  number of valid contacts
+//	n          per-cell count buffer (N_SPLIT*N_SPLIT entries), input
+//	offsets    per-cell offset buffer (N_SPLIT*N_SPLIT entries), input
+//	staticIdx  passed to the kernel in cdata.z
+//
+//	The if(0) sections are disabled diagnostics and a disabled host fallback.
+template<DeviceType TYPE>
+void Solver<TYPE>::batchContacts( typename Solver<TYPE>::Data* data, Buffer<Contact4>* contacts, int nContacts, Buffer<u32>* n, Buffer<u32>* offsets, int staticIdx )
+{
+	ADLASSERT( data->m_device->m_type == TYPE_CL );
+
+	if(0)
+	{
+		BT_PROFILE("CPU classTestKernel/Kernel (batch generation?)");
+
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			Solver<TYPE_HOST>::Data* hostData = Solver<TYPE_HOST>::allocate( deviceHost, 0 );
+			Solver<TYPE_HOST>::batchContacts( hostData, contacts, nContacts, n, offsets, staticIdx );
+			Solver<TYPE_HOST>::deallocate( hostData );
+		}
+		DeviceUtils::deallocate( deviceHost );
+		return;
+	}
+
+	Buffer<Contact4>* contactNative 
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, contacts, nContacts );
+	Buffer<u32>* nNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, n );
+	Buffer<u32>* offsetsNative
+		= BufferUtils::map<TYPE_CL, true>( data->m_device, offsets );
+
+	{
+		BT_PROFILE("GPU classTestKernel/Kernel (batch generation?)");
+		Buffer<int4> constBuffer( data->m_device, 1, BufferBase::BUFFER_CONST );
+		//	x = contact count, z = static body index; y and w are zeroed so that no
+		//	indeterminate value is handed to the kernel
+		int4 cdata = make_int4( nContacts, 0, staticIdx, 0 );
+
+		//	64 work items per cell
+		int numWorkItems = 64*N_SPLIT*N_SPLIT;
+#ifdef BATCH_DEBUG
+		SolverDebugInfo* debugInfo = new  SolverDebugInfo[numWorkItems];
+		adl::Buffer<SolverDebugInfo> gpuDebugInfo(data->m_device,numWorkItems);
+		memset(debugInfo,0,sizeof(SolverDebugInfo)*numWorkItems);
+		gpuDebugInfo.write(debugInfo,numWorkItems);
+#endif
+
+
+		BufferInfo bInfo[] = { 
+			BufferInfo( contactNative ), 
+			BufferInfo( data->m_contactBuffer ), 
+			BufferInfo( nNative ), 
+			BufferInfo( offsetsNative ) 
+#ifdef BATCH_DEBUG
+			,	BufferInfo(&gpuDebugInfo)
+#endif
+		};
+
+		
+		
+		Launcher launcher( data->m_device, data->m_batchingKernel);
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( constBuffer, cdata );
+		launcher.launch1D( numWorkItems, 64 );
+		DeviceUtils::waitForCompletion( data->m_device );
+
+#ifdef BATCH_DEBUG
+		//	read the batched contacts back so they can be inspected in a debugger
+		Contact4* hostContacts = new Contact4[nContacts];
+		data->m_contactBuffer->read(hostContacts,nContacts);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		gpuDebugInfo.read(debugInfo,numWorkItems);
+		DeviceUtils::waitForCompletion( data->m_device );
+
+		for (int i=0;i<numWorkItems;i++)
+		{
+			if (debugInfo[i].m_valInt1>0)
+			{
+				printf("catch\n");
+			}
+			if (debugInfo[i].m_valInt2>0)
+			{
+				printf("catch22\n");
+			}
+
+			if (debugInfo[i].m_valInt3>0)
+			{
+				printf("catch666\n");
+			}
+
+			if (debugInfo[i].m_valInt4>0)
+			{
+				printf("catch777\n");
+			}
+		}
+		delete[] hostContacts;
+		delete[] debugInfo;
+#endif //BATCH_DEBUG
+
+	}
+
+	if(0)
+	{
+		//	disabled diagnostics: per-cell counts and batch index statistics
+		u32* nhost = new u32[N_SPLIT*N_SPLIT];
+
+		nNative->read( nhost, N_SPLIT*N_SPLIT );
+
+		Contact4* chost = new Contact4[nContacts];
+		data->m_contactBuffer->read( chost, nContacts );
+		DeviceUtils::waitForCompletion( data->m_device );
+		printf(">>");
+		int nonzero = 0;
+		u32 maxn = 0;
+		for(int i=0; i<N_SPLIT*N_SPLIT; i++)
+		{
+			printf("%d-", nhost[i]);
+			nonzero += (nhost[i]==0)? 0:1;
+			maxn = max2( nhost[i], maxn );
+		}
+		printf("\nnonzero:zero = %d:%d (%d)\n", nonzero, N_SPLIT*N_SPLIT-nonzero, maxn);
+		printf("\n\n");
+
+		int prev = 0;
+		int prevIdx = 0;
+		int maxNBatches = 0;
+		for(int i=0; i<nContacts; i++)
+		{
+//			printf("(%d, %d:%d),", chost[i].m_batchIdx, chost[i].m_bodyAPtr, chost[i].m_bodyBPtr);
+			if( prev != 0 && chost[i].m_batchIdx == 0 )
+			{
+				maxNBatches = max2( maxNBatches, prev );
+				printf("\n[%d]", prev);
+
+				//for(int j=prevIdx; j<i; j++)
+				//{
+				//	printf("(%d:%d),", chost[j].m_bodyAPtr, chost[j].m_bodyBPtr);
+				//}
+
+				//printf("\n");
+
+				prevIdx = i;
+			}
+
+			printf("%d,", chost[i].m_batchIdx);
+
+			prev = chost[i].m_batchIdx;
+		}
+		printf("\n");
+		printf("Max: %d\n", maxNBatches);
+
+		delete [] chost;
+		delete [] nhost;
+	}
+//	copy buffer to buffer
+	contactNative->write( *data->m_contactBuffer, nContacts );
+	DeviceUtils::waitForCompletion( data->m_device );
+
+	if(0)
+	{
+		//	disabled sanity check: every body index must be <= staticIdx
+		DeviceUtils::Config dhCfg;
+		Device* deviceHost = DeviceUtils::allocate( TYPE_HOST, dhCfg );
+		{
+			HostBuffer<Contact4> host( deviceHost, nContacts );
+			contactNative->read( host.m_ptr, nContacts );
+			DeviceUtils::waitForCompletion( data->m_device );
+
+			for(int i=0; i<nContacts; i++)
+			{
+				ADLASSERT( host[i].m_bodyAPtr <= (u32)staticIdx );
+				ADLASSERT( host[i].m_bodyBPtr <= (u32)staticIdx );
+			}
+		}
+		DeviceUtils::deallocate( deviceHost );
+	}
+
+	//	Copy back the same element count that was mapped above, as reorderConvertToConstraints
+	//	does for its map/unmap pair. NOTE(review): assumes an omitted count means "whole
+	//	destination buffer" - confirm in BufferUtils.
+	BufferUtils::unmap<true>( contactNative, contacts, nContacts );
+	BufferUtils::unmap<false>( nNative, n );
+	BufferUtils::unmap<false>( offsetsNative, offsets );
+}
+
+//	Remove every helper macro defined at the top of this file so that none leaks into
+//	the translation unit that includes it
+#undef PATH
+#undef BATCHING_PATH
+#undef KERNEL1
+#undef KERNEL2
+
+#undef KERNEL3
+#undef KERNEL4
+#undef KERNEL5