Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearch.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearch.h
@@ -0,0 +1,73 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+#include <AdlPrimitives/Sort/SortData.h>
+#include <AdlPrimitives/Fill/Fill.h>
+
+namespace adl
+{
+
+class BoundSearchBase	//	non-template base holding the Option enum used by the BoundSearch<TYPE> classes
+{
+	public:
+		enum Option	//	selects what BoundSearch<TYPE>::execute stores in dst (dst is indexed by key)
+		{
+			BOUND_LOWER,	//	for each key present in src: index of the first element with that key
+			BOUND_UPPER,	//	for each key present in src: index one past the last element with that key
+			COUNT,	//	upper - lower for every dst entry, i.e. how many src elements carry that key
+		};
+};
+
+template<DeviceType TYPE>
+class BoundSearch : public BoundSearchBase
+{	//	Bound search over key-sorted SortData run through device kernels; all members are static and work on a Data handle
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+		//	State created by allocate() and released by deallocate()
+		struct Data
+		{
+			const Device* m_device;	//	device passed to allocate()
+			Kernel* m_lowerSortDataKernel;	//	"SearchSortDataLowerKernel"
+			Kernel* m_upperSortDataKernel;	//	"SearchSortDataUpperKernel"
+			Kernel* m_subtractKernel;	//	"SubtractKernel"; only fetched by allocate() when maxSize != 0
+			Buffer<int4>* m_constBuffer;	//	one int4 constant block; execute() stores nSrc in x and nDst in y
+			Buffer<u32>* m_lower;	//	COUNT scratch buffer for the lower bounds; 0 when maxSize == 0
+			Buffer<u32>* m_upper;	//	COUNT scratch buffer for the upper bounds; 0 when maxSize == 0
+			typename Fill<TYPE>::Data* m_fillData;	//	state of the Fill primitive that zeroes the scratch buffers; 0 when maxSize == 0
+		};
+		//	maxSize: number of u32 entries in each COUNT scratch buffer; with the default 0 none are created
+		static
+		Data* allocate(const Device* deviceData, int maxSize = 0);
+		//	Frees data together with the buffers it owns
+		static
+		void deallocate(Data* data);
+		//	BOUND_LOWER: dst[key] = first index with that key, BOUND_UPPER: one past the last, COUNT: their difference
+		//	src has to be sorted by key, i.e. src[i].m_key <= src[i+1].m_key
+		static
+		void execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option = BOUND_LOWER );
+		//	disabled overload for raw u32 keys (the declaration below is incomplete)
+//		static
+//		void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
+};
+
+#include <AdlPrimitives/Search/BoundSearchHost.inl>
+#include <AdlPrimitives/Search/BoundSearch.inl>
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearch.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearch.inl
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Search\\BoundSearchKernels"
+#define KERNEL0 "SearchSortDataLowerKernel"
+#define KERNEL1 "SearchSortDataUpperKernel"
+#define KERNEL2 "SubtractKernel"
+
+#include <AdlPrimitives/Search/BoundSearchKernelsCL.h>
+#include <AdlPrimitives/Search/BoundSearchKernelsDX11.h>
+
+template<DeviceType TYPE>
+typename BoundSearch<TYPE>::Data* BoundSearch<TYPE>::allocate(const Device* device, int maxSize)
+{	//	Builds the state used by execute(); maxSize sizes the COUNT scratch buffers (0 = none are created)
+	ADLASSERT( TYPE == device->m_type );
+	//	embedded kernel sources, selected by TYPE below; both null unless ADL_LOAD_KERNEL_FROM_STRING is defined
+	const char* src[] = 
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+		{boundSearchKernelsCL, boundSearchKernelsDX11};
+#else
+		{0,0};
+#endif
+
+	Data* data = new Data;
+
+	data->m_device = device;
+	data->m_lowerSortDataKernel = device->getKernel( PATH, KERNEL0, 0, src[TYPE] );
+	data->m_upperSortDataKernel = device->getKernel( PATH, KERNEL1, 0, src[TYPE] );
+	data->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
+
+	//	Everything below is only needed by the COUNT option. Each member is assigned on
+	//	both paths so that m_subtractKernel is never left uninitialized when maxSize == 0.
+	const bool hasCount = (maxSize != 0);
+	data->m_subtractKernel = hasCount? device->getKernel( PATH, KERNEL2, 0, src[TYPE] ): 0;
+	data->m_lower = hasCount? new Buffer<u32>( device, maxSize ): 0;
+	data->m_upper = hasCount? new Buffer<u32>( device, maxSize ): 0;
+	data->m_fillData = hasCount? Fill<TYPE>::allocate( device ): 0;
+	return data;
+}
+
+template<DeviceType TYPE>
+void BoundSearch<TYPE>::deallocate(Data* data)
+{	//	Frees the buffers and Fill state owned by data, then data itself; kernels are not released here
+	if( !data ) return;	//	tolerate a null handle, like delete does
+	delete data->m_constBuffer;
+	delete data->m_lower; delete data->m_upper;	//	both are 0 when maxSize was 0; delete ignores null
+	if( data->m_fillData ) Fill<TYPE>::deallocate( data->m_fillData );
+	delete data;
+}
+
+template<DeviceType TYPE>
+void BoundSearch<TYPE>::execute(Data* data, Buffer<SortData>& src, u32 nSrc, Buffer<u32>& dst, u32 nDst, Option option )
+{	//	src[0..nSrc) must be sorted by key; results are written to dst, which the kernels index by key
+	int4 constBuffer;
+	constBuffer.x = nSrc;
+	constBuffer.y = nDst;
+
+	Buffer<SortData>* srcNative = BufferUtils::map<TYPE, true>( data->m_device, &src );
+	Buffer<u32>* dstNative = BufferUtils::map<TYPE, false>( data->m_device, &dst );
+
+	if( option == BOUND_LOWER )
+	{
+		BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };
+		//	one work-item per src element
+		Launcher launcher( data->m_device, data->m_lowerSortDataKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( *data->m_constBuffer, constBuffer );
+		launcher.launch1D( nSrc, 64 );
+	}
+	else if( option == BOUND_UPPER )
+	{
+		BufferInfo bInfo[] = { BufferInfo( srcNative, true ), BufferInfo( dstNative ) };
+		//	nSrc+1 work-items: the extra one closes the run of the last key
+		Launcher launcher( data->m_device, data->m_upperSortDataKernel );
+		launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+		launcher.setConst( *data->m_constBuffer, constBuffer );
+		launcher.launch1D( nSrc+1, 64 );
+	}
+	else if( option == COUNT )
+	{	//	dst = upper - lower, computed through the scratch buffers that allocate() creates when maxSize != 0
+		ADLASSERT( data->m_lower );
+		ADLASSERT( data->m_upper );
+		//	Fill and the subtract kernel below touch entries [0, nDst) of both scratch buffers
+		ADLASSERT( data->m_lower->getSize() >= (int)nDst );
+		ADLASSERT( data->m_upper->getSize() >= (int)nDst );
+		int zero = 0;	//	the search kernels never write the slots of keys that are absent from src
+		Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_lower, zero, nDst );
+		Fill<TYPE>::execute( data->m_fillData, (Buffer<int>&)*data->m_upper, zero, nDst );
+		//	the recursive calls fill in both bounds for the keys present in src
+		execute( data, src, nSrc, *data->m_lower, nDst, BOUND_LOWER );
+		execute( data, src, nSrc, *data->m_upper, nDst, BOUND_UPPER );
+
+		{
+			BufferInfo bInfo[] = { BufferInfo( data->m_upper, true ), BufferInfo( data->m_lower, true ), BufferInfo( dstNative ) };
+			//	NOTE(review): BoundSearchKernels.hlsl defines no SubtractKernel, so COUNT looks OpenCL-only - confirm
+			Launcher launcher( data->m_device, data->m_subtractKernel );
+			launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+			launcher.setConst( *data->m_constBuffer, constBuffer );
+			launcher.launch1D( nDst, 64 );
+		}
+	}
+	else
+	{
+		ADLASSERT( 0 );
+	}
+
+	BufferUtils::unmap<false>( srcNative, &src );
+	BufferUtils::unmap<true>( dstNative, &dst );
+}
+
+
+#undef PATH
+#undef KERNEL0
+#undef KERNEL1
+#undef KERNEL2
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchHost.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchHost.inl
@@ -0,0 +1,111 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+template<>
+class BoundSearch<TYPE_HOST> : public BoundSearchBase
+{	//	Host (CPU) specialization; execute() works directly on HostBuffer storage
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		struct Data
+		{
+			const Device* m_device;	//	used to create the scratch HostBuffers of the COUNT option
+		};
+		//	Creates the host state. maxSize is accepted for signature parity and is not used.
+		static
+		Data* allocate(const Device* deviceData, int maxSize = 0)
+		{
+			ADLASSERT( deviceData->m_type == TYPE_HOST );
+			Data* data = new Data;
+			data->m_device = deviceData;
+			return data;
+		}
+		//	Frees the state returned by allocate()
+		static
+		void deallocate(Data* data)
+		{
+			delete data;
+		}
+		//	rawSrc must be sorted by key and every key must be < nDst; rawDst is indexed by key
+		static
+		void execute(Data* data, Buffer<SortData>& rawSrc, u32 nSrc, Buffer<u32>& rawDst, u32 nDst, Option option = BOUND_LOWER)
+		{
+			ADLASSERT( rawSrc.getType() == TYPE_HOST );
+			ADLASSERT( rawDst.getType() == TYPE_HOST );
+
+			HostBuffer<SortData>& src = *(HostBuffer<SortData>*)&rawSrc;
+			HostBuffer<u32>& dst = *(HostBuffer<u32>*)&rawDst;
+			//	sortedness check; counting from 1 keeps the u32 bound from wrapping when nSrc == 0
+			for(u32 i=1; i<nSrc; i++)
+				ADLASSERT( src[i-1].m_key <= src[i].m_key );
+
+			if( option == BOUND_LOWER )
+			{	//	dst[key] = index of the first element that carries key
+				for(u32 i=0; i<nSrc; i++)
+				{	//	held by value: the sentinel is a temporary and cannot bind to SortData&
+					const SortData iData = (i==0)? SortData(-1,-1): src[i-1];
+					const SortData jData = src[i];
+
+					if( iData.m_key != jData.m_key )
+					{
+						//	element i starts a new run of equal keys; only that key's slot is
+						//	written, dst entries of keys absent from src are left untouched
+						const u32 k = jData.m_key;
+						ADLASSERT( k < nDst );
+						dst[k] = i;
+					}
+				}
+			}
+			else if( option == BOUND_UPPER )
+			{	//	dst[key] = index one past the last element that carries key
+				for(u32 i=0; i<nSrc+1; i++)
+				{	//	the extra iteration i == nSrc closes the run of the last key
+					const SortData iData = (i==0)? SortData(0,0): src[i-1];
+					const SortData jData = (i==nSrc)? SortData(nDst, nDst): src[i];
+
+					if( iData.m_key != jData.m_key )
+					{
+						//	element i-1 ends a run of equal keys (for i == 0 the sentinel key 0
+						//	is used, so dst[0] is set to 0 when src does not start with key 0)
+						const u32 k = iData.m_key;
+						ADLASSERT( k < nDst );
+						dst[k] = i;
+					}
+				}
+			}
+			else if( option == COUNT )
+			{	//	dst[key] = number of elements that carry key (upper - lower)
+				HostBuffer<u32> lower( data->m_device, nDst );
+				HostBuffer<u32> upper( data->m_device, nDst );
+				//	zero both so that keys absent from src end up with a count of 0
+				for(u32 i=0; i<nDst; i++) { lower[i] = upper[i] = 0; }
+
+				execute( data, rawSrc, nSrc, lower, nDst, BOUND_LOWER );
+				execute( data, rawSrc, nSrc, upper, nDst, BOUND_UPPER );
+
+				for(u32 i=0; i<nDst; i++) { dst[i] = upper[i] - lower[i]; }
+			}
+			else
+			{
+				ADLASSERT( 0 );
+			}
+		}
+
+//		static
+//		void execute(Data* data, Buffer<u32>& src, Buffer<u32>& dst, int n, Option option = );
+};
+
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernels.cl
@@ -0,0 +1,112 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+typedef unsigned int u32;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+
+typedef struct
+{
+	u32 m_key; 	//	sort key; the search kernels use it as the index into dst
+	u32 m_value;	//	payload; never inspected by the kernels in this file
+}SortData;
+
+
+
+typedef struct
+{
+	u32 m_nSrc;	//	number of SortData entries in src
+	u32 m_nDst;	//	number of entries the kernels treat dst as having
+	u32 m_padding[2];	//	unused; presumably pads the block to the size of the host-side int4 - confirm
+} ConstBuffer;
+
+
+
+__attribute__((reqd_work_group_size(64,1,1)))
+__kernel
+void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, 
+					ConstBuffer cb)
+{	//	one work-item per src element: writes gIdx to dst[key] where a run of equal keys starts
+	int gIdx = GET_GLOBAL_IDX;
+	u32 nSrc = cb.m_nSrc;
+	u32 nDst = cb.m_nDst;
+	//	work-items past the end of src do nothing
+	if( gIdx < nSrc )
+	{
+		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);
+		SortData end; end.m_key = nDst; end.m_value = nDst;
+		//	NOTE(review): gIdx==nSrc cannot hold inside this branch, so `end` is never selected here
+		SortData iData = (gIdx==0)? first: src[gIdx-1];
+		SortData jData = (gIdx==nSrc)? end: src[gIdx];
+		//	keys differ => element gIdx is the first one of its key
+		if( iData.m_key != jData.m_key )
+		{
+//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
+			u32 k = jData.m_key;	//	NOTE(review): not range-checked against nDst
+			{
+				dst[k] = gIdx;
+			}
+		}
+	}
+}
+
+
+__attribute__((reqd_work_group_size(64,1,1)))
+__kernel
+void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, 
+					ConstBuffer cb)
+{	//	writes gIdx to dst[key] where the run of equal keys ending at gIdx-1 stops
+	int gIdx = GET_GLOBAL_IDX;
+	u32 nSrc = cb.m_nSrc;
+	u32 nDst = cb.m_nDst;
+	//	work-items 0..nSrc take part; the last one closes the run of the final key
+	if( gIdx < nSrc+1 )
+	{
+		SortData first; first.m_key = 0; first.m_value = 0;
+		SortData end; end.m_key = nDst; end.m_value = nDst;
+		//	sentinels stand in for the missing neighbours in front of and behind src
+		SortData iData = (gIdx==0)? first: src[gIdx-1];
+		SortData jData = (gIdx==nSrc)? end: src[gIdx];
+		//	keys differ => element gIdx-1 was the last one of its key
+		if( iData.m_key != jData.m_key )
+		{
+//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
+			u32 k = iData.m_key;	//	NOTE(review): not range-checked against nDst
+			{
+				dst[k] = gIdx;
+			}
+		}
+	}
+}
+
+__attribute__((reqd_work_group_size(64,1,1)))
+__kernel
+void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, 
+					ConstBuffer cb)
+{	//	element-wise C = A - B over the first nDst entries
+	int gIdx = GET_GLOBAL_IDX;
+	u32 nSrc = cb.m_nSrc;	//	read but not used by this kernel
+	u32 nDst = cb.m_nDst;
+	//	work-items at or beyond nDst do nothing
+	if( gIdx < nDst )
+	{
+		C[gIdx] = A[gIdx] - B[gIdx];
+	}
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernels.hlsl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernels.hlsl
@@ -0,0 +1,104 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+typedef uint u32;
+
+#define GET_GROUP_IDX groupIdx.x
+#define GET_LOCAL_IDX localIdx.x
+#define GET_GLOBAL_IDX globalIdx.x
+#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
+#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
+#define AtomInc(x) InterlockedAdd(x, 1)
+#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
+
+
+
+typedef struct
+{
+	u32 m_key; 	//	sort key; the search kernels use it as the index into dst
+	u32 m_value;	//	payload; never inspected by the kernels in this file
+}SortData;
+
+
+
+cbuffer SortCB : register( b0 )
+{
+	u32 m_nSrc;	//	number of SortData entries in src
+	u32 m_nDst;	//	number of entries the kernels treat dst as having
+	u32 m_padding[2];	//	not read by the kernels in this file
+};
+
+
+StructuredBuffer<SortData> src : register( t0 );	//	input elements; only read by the kernels below
+RWStructuredBuffer<u32> dst : register( u0 );	//	output; written at the index given by an element's m_key
+
+
+[numthreads(64, 1, 1)]
+void SearchSortDataLowerKernel( DEFAULT_ARGS )
+{	//	one thread per src element: writes gIdx to dst[key] where a run of equal keys starts
+	int gIdx = GET_GLOBAL_IDX;
+	u32 nSrc = m_nSrc;
+	u32 nDst = m_nDst;
+	//	threads past the end of src do nothing
+	if( gIdx < nSrc )
+	{
+		SortData iData;
+		SortData jData;
+		if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;	//	sentinel in front of src
+		else iData = src[gIdx-1];
+		//	NOTE(review): gIdx==nSrc cannot hold inside this branch, so the nDst sentinel is never used here
+		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
+		else jData = src[gIdx];
+		//	keys differ => element gIdx is the first one of its key
+		if( iData.m_key != jData.m_key )
+		{
+//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)
+			u32 k = jData.m_key;	//	NOTE(review): not range-checked against nDst
+			{
+				dst[k] = gIdx;
+			}
+		}
+	}
+}
+
+[numthreads(64, 1, 1)]
+void SearchSortDataUpperKernel( DEFAULT_ARGS )
+{	//	writes gIdx to dst[key] where the run of equal keys ending at gIdx-1 stops
+	int gIdx = GET_GLOBAL_IDX;
+	u32 nSrc = m_nSrc;
+	u32 nDst = m_nDst;
+	//	threads 0..nSrc take part; the last one closes the run of the final key
+	if( gIdx < nSrc+1 )
+	{
+		SortData iData;
+		SortData jData;
+		if( gIdx==0 ) iData.m_key = iData.m_value = 0;	//	sentinel in front of src
+		else iData = src[gIdx-1];
+		//	sentinel behind src, keyed with nDst
+		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;
+		else jData = src[gIdx];
+		//	keys differ => element gIdx-1 was the last one of its key
+		if( iData.m_key != jData.m_key )
+		{
+//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)
+			u32 k = iData.m_key;	//	NOTE(review): not range-checked against nDst
+			{
+				dst[k] = gIdx;
+			}
+		}
+	}
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernelsCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernelsCL.h
@@ -0,0 +1,102 @@
+static const char* boundSearchKernelsCL= /* embedded copy of the kernels in BoundSearchKernels.cl; BoundSearch<TYPE>::allocate passes it to getKernel() when ADL_LOAD_KERNEL_FROM_STRING is defined */ \
+"/*\n"
+"		2011 Takahiro Harada\n"
+"*/\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	u32 m_key; \n"
+"	u32 m_value;\n"
+"}SortData;\n"
+"\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	u32 m_nSrc;\n"
+"	u32 m_nDst;\n"
+"	u32 m_padding[2];\n"
+"} ConstBuffer;\n"
+"\n"
+"\n"
+"\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"__kernel\n"
+"void SearchSortDataLowerKernel(__global SortData* src, __global u32 *dst, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	u32 nSrc = cb.m_nSrc;\n"
+"	u32 nDst = cb.m_nDst;\n"
+"\n"
+"	if( gIdx < nSrc )\n"
+"	{\n"
+"		SortData first; first.m_key = (u32)(-1); first.m_value = (u32)(-1);\n"
+"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+"\n"
+"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
+"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+"\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
+"			u32 k = jData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"__kernel\n"
+"void SearchSortDataUpperKernel(__global SortData* src, __global u32 *dst, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	u32 nSrc = cb.m_nSrc;\n"
+"	u32 nDst = cb.m_nDst;\n"
+"\n"
+"	if( gIdx < nSrc+1 )\n"
+"	{\n"
+"		SortData first; first.m_key = 0; first.m_value = 0;\n"
+"		SortData end; end.m_key = nDst; end.m_value = nDst;\n"
+"\n"
+"		SortData iData = (gIdx==0)? first: src[gIdx-1];\n"
+"		SortData jData = (gIdx==nSrc)? end: src[gIdx];\n"
+"\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
+"			u32 k = iData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"__kernel\n"
+"void SubtractKernel(__global u32* A, __global u32 *B, __global u32 *C, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	u32 nSrc = cb.m_nSrc;\n"
+"	u32 nDst = cb.m_nDst;\n"
+"\n"
+"	if( gIdx < nDst )\n"
+"	{\n"
+"		C[gIdx] = A[gIdx] - B[gIdx];\n"
+"	}\n"
+"}\n"
+"\n"
+;
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernelsDX11.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Search/BoundSearchKernelsDX11.h
@@ -0,0 +1,94 @@
+static const char* boundSearchKernelsDX11= /* embedded copy of the kernels in BoundSearchKernels.hlsl; BoundSearch<TYPE>::allocate passes it to getKernel() when ADL_LOAD_KERNEL_FROM_STRING is defined */ \
+"/*\n"
+"		2011 Takahiro Harada\n"
+"*/\n"
+"\n"
+"typedef uint u32;\n"
+"\n"
+"#define GET_GROUP_IDX groupIdx.x\n"
+"#define GET_LOCAL_IDX localIdx.x\n"
+"#define GET_GLOBAL_IDX globalIdx.x\n"
+"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
+"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
+"#define AtomInc(x) InterlockedAdd(x, 1)\n"
+"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
+"\n"
+"\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	u32 m_key; \n"
+"	u32 m_value;\n"
+"}SortData;\n"
+"\n"
+"\n"
+"\n"
+"cbuffer SortCB : register( b0 )\n"
+"{\n"
+"	u32 m_nSrc;\n"
+"	u32 m_nDst;\n"
+"	u32 m_padding[2];\n"
+"};\n"
+"\n"
+"\n"
+"StructuredBuffer<SortData> src : register( t0 );\n"
+"RWStructuredBuffer<u32> dst : register( u0 );\n"
+"\n"
+"\n"
+"[numthreads(64, 1, 1)]\n"
+"void SearchSortDataLowerKernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	u32 nSrc = m_nSrc;\n"
+"	u32 nDst = m_nDst;\n"
+"\n"
+"	if( gIdx < nSrc )\n"
+"	{\n"
+"		SortData iData;\n"
+"		SortData jData;\n"
+"		if( gIdx==0 ) iData.m_key = iData.m_value = (u32)-1;\n"
+"		else iData = src[gIdx-1];\n"
+"\n"
+"		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
+"		else jData = src[gIdx];\n"
+"\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"//			for(u32 k=iData.m_key+1; k<=min(jData.m_key, nDst-1); k++)\n"
+"			u32 k = jData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+"[numthreads(64, 1, 1)]\n"
+"void SearchSortDataUpperKernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"	u32 nSrc = m_nSrc;\n"
+"	u32 nDst = m_nDst;\n"
+"\n"
+"	if( gIdx < nSrc+1 )\n"
+"	{\n"
+"		SortData iData;\n"
+"		SortData jData;\n"
+"		if( gIdx==0 ) iData.m_key = iData.m_value = 0;\n"
+"		else iData = src[gIdx-1];\n"
+"\n"
+"		if( gIdx==nSrc ) jData.m_key = jData.m_value = nDst;\n"
+"		else jData = src[gIdx];\n"
+"\n"
+"		if( iData.m_key != jData.m_key )\n"
+"		{\n"
+"//			for(u32 k=iData.m_key; k<min(jData.m_key, nDst); k++)\n"
+"			u32 k = iData.m_key;\n"
+"			{\n"
+"				dst[k] = gIdx;\n"
+"			}\n"
+"		}\n"
+"	}\n"
+"}\n"
+"\n"
+;