Add the GPU rigid body pipeline from https://github.com/erwincoumans/experiments as a Bullet 3.x preview for Bullet 2.80

2012-03-05 00:54:32 +00:00
parent 73c4646b40
commit 571af41cf6
257 changed files with 55106 additions and 0 deletions
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.h
@@ -0,0 +1,73 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+#pragma once
+
+#include <Adl/Adl.h>
+#include <AdlPrimitives/Math/Math.h>
+
+namespace adl
+{
+
+// Non-template base shared by every Copy<TYPE>; it only carries the Option enum.
+class CopyBase
+{
+	public:
+		// Selects which float4 kernel Copy<TYPE>::execute launches, i.e. how many
+		// float4 elements a single work item copies.
+		enum Option
+		{
+			PER_WI_1, 	// one element per work item (Copy1F4Kernel)
+			PER_WI_2, 	// two elements per work item (Copy2F4Kernel); execute asserts n%2 == 0
+			PER_WI_4, 	// four elements per work item (Copy4F4Kernel); execute asserts n%4 == 0
+		};
+};
+
+// Buffer-to-buffer copy primitive for device type TYPE.
+// All members are static; the per-device state lives in a Data object that the
+// caller obtains from allocate() and hands back to execute()/deallocate().
+// Copy<TYPE_HOST> is provided as an explicit specialization in CopyHost.inl.
+template<DeviceType TYPE>
+class Copy : public CopyBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// State created by allocate(): the device, one kernel per copy variant and
+		// the constant buffer used to pass the element count to the kernels.
+		struct Data
+		{
+			const Device* m_device;
+			Kernel* m_copy1F4Kernel;	// float4, 1 element per work item
+			Kernel* m_copy2F4Kernel;	// float4, 2 elements per work item
+			Kernel* m_copy4F4Kernel;	// float4, 4 elements per work item
+			Kernel* m_copyF1Kernel;		// float
+			Kernel* m_copyF2Kernel;		// float2
+			Buffer<int4>* m_constBuffer;	// single int4; x carries the element count
+		};
+
+		// Creates the Data for the given device (asserts TYPE == device type).
+		// The result must be released with deallocate().
+		static
+		Data* allocate(const Device* deviceData);
+
+		// Deletes the constant buffer and the Data object itself.
+		static
+		void deallocate(Data* data);
+
+		// Copies n float4 elements from src to dst. option selects the kernel variant;
+		// PER_WI_2 / PER_WI_4 assert that n is a multiple of 2 / 4.
+		static
+		void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1);
+
+		// Copies n float2 elements from src to dst, one element per work item.
+		static
+		void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n);
+
+		// Copies n float elements from src to dst, one element per work item.
+		static
+		void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n);
+};
+
+
+#include <AdlPrimitives/Copy/CopyHost.inl>
+#include <AdlPrimitives/Copy/Copy.inl>
+
+};
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/Copy.inl
@@ -0,0 +1,151 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+
+// Kernel file path (without extension) and entry-point names handed to
+// Device::getKernel in Copy<TYPE>::allocate. All six macros are #undef'd at the
+// end of this file.
+#define PATH "..\\..\\opencl\\primitives\\AdlPrimitives\\Copy\\CopyKernels"
+#define KERNEL0 "Copy1F4Kernel"
+#define KERNEL1 "Copy2F4Kernel"
+#define KERNEL2 "Copy4F4Kernel"
+#define KERNEL3 "CopyF1Kernel"
+#define KERNEL4 "CopyF2Kernel"
+
+#include <AdlPrimitives/Copy/CopyKernelsCL.h>
+#include <AdlPrimitives/Copy/CopyKernelsDX11.h>
+
+
+// Creates the per-device state for Copy<TYPE>: looks up the five copy kernels on
+// the device and allocates the one-element constant buffer.
+// The returned object is heap allocated; hand it back to deallocate().
+template<DeviceType TYPE>
+typename Copy<TYPE>::Data* Copy<TYPE>::allocate( const Device* device )
+{
+	ADLASSERT( TYPE == device->m_type );
+
+	// Embedded kernel sources, indexed by TYPE. Without ADL_LOAD_KERNEL_FROM_STRING
+	// a null source pointer is passed to getKernel together with PATH.
+#if defined(ADL_LOAD_KERNEL_FROM_STRING)
+	const char* embeddedSources[] = { copyKernelsCL, copyKernelsDX11 };
+#else
+	const char* embeddedSources[] = { 0, 0 };
+#endif
+	const char* kernelSource = embeddedSources[TYPE];
+
+	Data* state = new Data;
+	state->m_device = device;
+
+	// float4 variants: 1, 2 and 4 elements per work item
+	state->m_copy1F4Kernel = device->getKernel( PATH, KERNEL0, 0, kernelSource );
+	state->m_copy2F4Kernel = device->getKernel( PATH, KERNEL1, 0, kernelSource );
+	state->m_copy4F4Kernel = device->getKernel( PATH, KERNEL2, 0, kernelSource );
+
+	// float and float2 variants
+	state->m_copyF1Kernel = device->getKernel( PATH, KERNEL3, 0, kernelSource );
+	state->m_copyF2Kernel = device->getKernel( PATH, KERNEL4, 0, kernelSource );
+
+	state->m_constBuffer = new Buffer<int4>( device, 1, BufferBase::BUFFER_CONST );
+
+	return state;
+}
+
+// Releases what allocate() created with new: the constant buffer and the Data
+// object itself. The Kernel pointers obtained from Device::getKernel are not
+// released here (NOTE(review): presumably the device owns them - confirm).
+// Passing a null pointer is a no-op.
+template<DeviceType TYPE>
+void Copy<TYPE>::deallocate( Data* data )
+{
+	// Accept null like `delete` does; Copy<TYPE_HOST>::allocate returns 0, so
+	// code written generically over TYPE may legitimately pass a null handle.
+	if( data == 0 )
+		return;
+
+	delete data->m_constBuffer;
+	delete data;
+}
+
+// Copies n float4 elements from src to dst on the device.
+// option selects the kernel and how many elements each work item handles;
+// n must be a multiple of that count (asserted). An unknown option asserts and
+// launches nothing.
+template<DeviceType TYPE>
+void Copy<TYPE>::execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option )
+{
+	ADLASSERT( TYPE == dst.getType() );
+	ADLASSERT( TYPE == src.getType() );
+
+	// Only x (the element count) is written; y/z/w stay unset. The kernels in
+	// CopyKernels read nothing but m_n from the constant buffer.
+	int4 kernelConsts;
+	kernelConsts.x = n;
+
+	// Pick the kernel and its elements-per-work-item factor.
+	Kernel* kernel = 0;
+	int elemsPerWI = 1;
+	switch (option)
+	{
+	case PER_WI_1:
+		kernel = data->m_copy1F4Kernel;
+		elemsPerWI = 1;
+		break;
+	case PER_WI_2:
+		kernel = data->m_copy2F4Kernel;
+		elemsPerWI = 2;
+		break;
+	case PER_WI_4:
+		kernel = data->m_copy4F4Kernel;
+		elemsPerWI = 4;
+		break;
+	default:
+		ADLASSERT(0);
+		return;
+	}
+
+	ADLASSERT( n%elemsPerWI == 0 );
+
+	// dst is bound writable, src read-only (second BufferInfo argument).
+	BufferInfo bInfo[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
+
+	Launcher launcher( data->m_device, kernel );
+	launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
+	launcher.setConst( *data->m_constBuffer, kernelConsts );
+	launcher.launch1D( n/elemsPerWI );
+}
+
+// Copies n float2 elements from src to dst on the device, one element per work item.
+template<DeviceType TYPE>
+void Copy<TYPE>::execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n )
+{
+	ADLASSERT( TYPE == dst.getType() );
+	ADLASSERT( TYPE == src.getType() );
+
+	// dst is bound writable, src read-only (second BufferInfo argument).
+	BufferInfo buffers[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
+
+	// Only x (the element count) is written; CopyF2Kernel reads nothing else.
+	int4 kernelConsts;
+	kernelConsts.x = n;
+
+	Launcher launcher( data->m_device, data->m_copyF2Kernel );
+	launcher.setBuffers( buffers, sizeof(buffers)/sizeof(Launcher::BufferInfo) );
+	launcher.setConst( *data->m_constBuffer, kernelConsts );
+	launcher.launch1D( n );
+}
+
+// Copies n float elements from src to dst on the device, one element per work item.
+template<DeviceType TYPE>
+void Copy<TYPE>::execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n )
+{
+	ADLASSERT( TYPE == dst.getType() );
+	ADLASSERT( TYPE == src.getType() );
+
+	// dst is bound writable, src read-only (second BufferInfo argument).
+	BufferInfo buffers[] = { BufferInfo( &dst ), BufferInfo( &src, true ) };
+
+	// Only x (the element count) is written; CopyF1Kernel reads nothing else.
+	int4 kernelConsts;
+	kernelConsts.x = n;
+
+	Launcher launcher( data->m_device, data->m_copyF1Kernel );
+	launcher.setBuffers( buffers, sizeof(buffers)/sizeof(Launcher::BufferInfo) );
+	launcher.setConst( *data->m_constBuffer, kernelConsts );
+	launcher.launch1D( n );
+}
+
+
+// Drop the file-local helper macros defined at the top of this file.
+#undef PATH
+#undef KERNEL0
+#undef KERNEL1
+#undef KERNEL2
+#undef KERNEL3
+#undef KERNEL4
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyHost.inl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyHost.inl
@@ -0,0 +1,85 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+// Host (CPU) specialization of Copy: no kernels and no per-device state, the
+// copy is a plain element-wise loop over host buffers.
+template<>
+class Copy<TYPE_HOST> : public CopyBase
+{
+	public:
+		typedef Launcher::BufferInfo BufferInfo;
+
+		// Empty: the host path keeps no state.
+		struct Data
+		{
+		};
+
+		// Only checks the device type; always returns a null Data pointer.
+		static
+		Data* allocate(const Device* deviceData)
+		{
+			ADLASSERT( TYPE_HOST == deviceData->m_type );
+			return 0;
+		}
+
+		// Nothing was allocated, so nothing is released.
+		static
+		void deallocate(Data* data)
+		{
+		}
+
+		// Copies n float4 elements from src to dst; data and option are ignored.
+		static
+		void execute( Data* data, Buffer<float4>& dst, Buffer<float4>& src, int n, Option option = PER_WI_1)
+		{
+			copyElements( dst, src, n );
+		}
+
+		// Copies n float2 elements from src to dst; data is ignored.
+		static
+		void execute( Data* data, Buffer<float2>& dst, Buffer<float2>& src, int n)
+		{
+			copyElements( dst, src, n );
+		}
+
+		// Copies n float elements from src to dst; data is ignored.
+		static
+		void execute( Data* data, Buffer<float>& dst, Buffer<float>& src, int n)
+		{
+			copyElements( dst, src, n );
+		}
+
+	private:
+		// Shared body of the three execute overloads: asserts both buffers are
+		// host buffers, then assigns the first n elements one by one.
+		template<typename T>
+		static
+		void copyElements( Buffer<T>& dst, Buffer<T>& src, int n )
+		{
+			ADLASSERT( TYPE_HOST == dst.getType() );
+			ADLASSERT( TYPE_HOST == src.getType() );
+
+			HostBuffer<T>& to = (HostBuffer<T>&)dst;
+			HostBuffer<T>& from = (HostBuffer<T>&)src;
+
+			for(int idx=0; idx<n; ++idx)
+				to[idx] = from[idx];
+		}
+};
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernels.cl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernels.cl
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+// Extensions requested for this program. The kernels in this file call neither
+// printf nor atomics.
+#pragma OPENCL EXTENSION cl_amd_printf : enable
+#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
+
+// Portability macros; CopyKernels.hlsl defines the same names for DX11.
+// Of these, the kernels below use only GET_GLOBAL_IDX.
+typedef unsigned int u32;
+#define GET_GROUP_IDX get_group_id(0)
+#define GET_LOCAL_IDX get_local_id(0)
+#define GET_GLOBAL_IDX get_global_id(0)
+#define GET_GROUP_SIZE get_local_size(0)
+#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)
+#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)
+#define AtomInc(x) atom_inc(&(x))
+#define AtomInc1(x, out) out = atom_inc(&(x))
+
+// Vector constructors spelled the way the HLSL version spells them.
+#define make_uint4 (uint4)
+#define make_uint2 (uint2)
+#define make_int2 (int2)
+
+// Kernel constants, passed by value as the last argument of every kernel.
+// The host side (Copy.inl) fills it from an int4 whose x component is the count.
+typedef struct
+{
+	int m_n;	// number of elements to copy
+	int m_padding[3];	// not read by any kernel in this file
+} ConstBuffer;
+
+
+
+// float4 copy, one element per work item: dst[i] = src[i] for every i < cb.m_n.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void Copy1F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+{
+	int elemIdx = GET_GLOBAL_IDX;
+
+	// work items at or beyond the element count have nothing to do
+	if( elemIdx >= cb.m_n )
+		return;
+
+	dst[ elemIdx ] = src[ elemIdx ];
+}
+
+// float4 copy, two consecutive elements per work item: work item g handles
+// elements 2*g and 2*g+1. The host asserts that cb.m_n is even and requests
+// m_n/2 work items.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void Copy2F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	// Guard on the LAST element this work item touches. The previous test
+	// (2*gIdx <= m_n) let the work item with 2*gIdx == m_n through, which read
+	// and wrote elements m_n and m_n+1, i.e. past the end of both buffers.
+	if( 2*gIdx+1 < cb.m_n )
+	{
+		float4 a0 = src[gIdx*2+0];
+		float4 a1 = src[gIdx*2+1];
+
+		dst[ gIdx*2+0 ] = a0;
+		dst[ gIdx*2+1 ] = a1;
+	}
+}
+
+// float4 copy, four consecutive elements per work item: work item g handles
+// elements 4*g .. 4*g+3. The host asserts that cb.m_n is a multiple of 4 and
+// requests m_n/4 work items.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void Copy4F4Kernel(__global float4* dst, __global float4* src, 
+					ConstBuffer cb)
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	int idx0 = gIdx*4+0;
+	int idx1 = gIdx*4+1;
+	int idx2 = gIdx*4+2;
+	int idx3 = gIdx*4+3;
+
+	// Guard on the LAST element this work item touches. The previous test
+	// (4*gIdx <= m_n) let the work item with 4*gIdx == m_n through, which read
+	// and wrote elements m_n .. m_n+3, i.e. past the end of both buffers.
+	if( idx3 < cb.m_n )
+	{
+		float4 a0 = src[idx0];
+		float4 a1 = src[idx1];
+		float4 a2 = src[idx2];
+		float4 a3 = src[idx3];
+
+		dst[ idx0 ] = a0;
+		dst[ idx1 ] = a1;
+		dst[ idx2 ] = a2;
+		dst[ idx3 ] = a3;
+	}
+}
+
+// float copy, one element per work item: dstF1[i] = srcF1[i] for every i < cb.m_n.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void CopyF1Kernel(__global float* dstF1, __global float* srcF1, 
+					ConstBuffer cb)
+{
+	int elemIdx = GET_GLOBAL_IDX;
+
+	// work items at or beyond the element count have nothing to do
+	if( elemIdx >= cb.m_n )
+		return;
+
+	dstF1[ elemIdx ] = srcF1[ elemIdx ];
+}
+
+// float2 copy, one element per work item: dstF2[i] = srcF2[i] for every i < cb.m_n.
+__kernel
+__attribute__((reqd_work_group_size(64,1,1)))
+void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, 
+					ConstBuffer cb)
+{
+	int elemIdx = GET_GLOBAL_IDX;
+
+	// work items at or beyond the element count have nothing to do
+	if( elemIdx >= cb.m_n )
+		return;
+
+	dstF2[ elemIdx ] = srcF2[ elemIdx ];
+}
+
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernels.hlsl
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernels.hlsl
@@ -0,0 +1,130 @@
+/*
+Copyright (c) 2012 Advanced Micro Devices, Inc.  
+
+This software is provided 'as-is', without any express or implied warranty.
+In no event will the authors be held liable for any damages arising from the use of this software.
+Permission is granted to anyone to use this software for any purpose, 
+including commercial applications, and to alter it and redistribute it freely, 
+subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+*/
+//Originally written by Takahiro Harada
+
+
+// Portability macros; CopyKernels.cl defines the same names for OpenCL.
+// Of these, the kernels below use only GET_GLOBAL_IDX, DEFAULT_ARGS and WG_SIZE.
+typedef uint u32;
+
+// Index accessors; they refer to the parameters declared by DEFAULT_ARGS.
+#define GET_GROUP_IDX groupIdx.x
+#define GET_LOCAL_IDX localIdx.x
+#define GET_GLOBAL_IDX globalIdx.x
+#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()
+#define GROUP_MEM_FENCE
+// Parameter list every kernel entry point uses.
+#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID
+#define AtomInc(x) InterlockedAdd(x, 1)
+#define AtomInc1(x, out) InterlockedAdd(x, 1, out)
+
+#define make_uint4 uint4
+#define make_uint2 uint2
+#define make_int2 int2
+
+// Thread-group size of every kernel in this file (see the numthreads attributes).
+#define WG_SIZE 64
+
+#define GET_GROUP_SIZE WG_SIZE
+
+
+
+// Kernel constants; the host side (Copy.inl) uploads an int4 whose x component
+// is the element count.
+cbuffer CB : register( b0 )
+{
+	int m_n;	// number of elements to copy
+	int m_padding[3];	// not read by any kernel in this file
+};
+
+// Buffers used by the three float4 kernels. The float and float2 kernels declare
+// their own views on the same u0 / t0 registers further down.
+RWStructuredBuffer<float4> dst : register( u0 );
+StructuredBuffer<float4> src : register( t0 );
+
+// float4 copy, one element per thread: dst[i] = src[i] for every i < m_n.
+[numthreads(WG_SIZE, 1, 1)]
+void Copy1F4Kernel( DEFAULT_ARGS )
+{
+	int elemIdx = GET_GLOBAL_IDX;
+
+	// threads at or beyond the element count have nothing to do
+	if( elemIdx >= m_n )
+		return;
+
+	dst[ elemIdx ] = src[ elemIdx ];
+}
+
+// float4 copy, two consecutive elements per thread: thread g handles elements
+// 2*g and 2*g+1. The host asserts that m_n is even and requests m_n/2 threads.
+[numthreads(WG_SIZE, 1, 1)]
+void Copy2F4Kernel( DEFAULT_ARGS )
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	// Guard on the LAST element this thread touches. The previous test
+	// (2*gIdx <= m_n) let the thread with 2*gIdx == m_n through, which accessed
+	// elements m_n and m_n+1, i.e. past the requested range.
+	if( 2*gIdx+1 < m_n )
+	{
+		float4 a0 = src[gIdx*2+0];
+		float4 a1 = src[gIdx*2+1];
+
+		dst[ gIdx*2+0 ] = a0;
+		dst[ gIdx*2+1 ] = a1;
+	}
+}
+
+// float4 copy, four consecutive elements per thread: thread g handles elements
+// 4*g .. 4*g+3. The host asserts that m_n is a multiple of 4 and requests
+// m_n/4 threads.
+[numthreads(WG_SIZE, 1, 1)]
+void Copy4F4Kernel( DEFAULT_ARGS )
+{
+	int gIdx = GET_GLOBAL_IDX;
+
+	int idx0 = gIdx*4+0;
+	int idx1 = gIdx*4+1;
+	int idx2 = gIdx*4+2;
+	int idx3 = gIdx*4+3;
+
+	// Guard on the LAST element this thread touches. The previous test
+	// (4*gIdx <= m_n) let the thread with 4*gIdx == m_n through, which accessed
+	// elements m_n .. m_n+3, i.e. past the requested range.
+	if( idx3 < m_n )
+	{
+		float4 a0 = src[idx0];
+		float4 a1 = src[idx1];
+		float4 a2 = src[idx2];
+		float4 a3 = src[idx3];
+
+		dst[ idx0 ] = a0;
+		dst[ idx1 ] = a1;
+		dst[ idx2 ] = a2;
+		dst[ idx3 ] = a3;
+	}
+}
+
+// float views used by CopyF1Kernel; they sit on the same u0 / t0 registers as
+// the float4 buffers declared earlier in this file.
+RWStructuredBuffer<float> dstF1 : register( u0 );
+StructuredBuffer<float> srcF1 : register( t0 );
+
+// float copy, one element per thread: dstF1[i] = srcF1[i] for every i < m_n.
+[numthreads(WG_SIZE, 1, 1)]
+void CopyF1Kernel( DEFAULT_ARGS )
+{
+	int elemIdx = GET_GLOBAL_IDX;
+
+	// threads at or beyond the element count have nothing to do
+	if( elemIdx >= m_n )
+		return;
+
+	dstF1[ elemIdx ] = srcF1[ elemIdx ];
+}
+
+// float2 views used by CopyF2Kernel; they sit on the same u0 / t0 registers as
+// the float4 buffers declared earlier in this file.
+RWStructuredBuffer<float2> dstF2 : register( u0 );
+StructuredBuffer<float2> srcF2 : register( t0 );
+
+// float2 copy, one element per thread: dstF2[i] = srcF2[i] for every i < m_n.
+[numthreads(WG_SIZE, 1, 1)]
+void CopyF2Kernel( DEFAULT_ARGS )
+{
+	int elemIdx = GET_GLOBAL_IDX;
+
+	// threads at or beyond the element count have nothing to do
+	if( elemIdx >= m_n )
+		return;
+
+	dstF2[ elemIdx ] = srcF2[ elemIdx ];
+}
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernelsCL.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernelsCL.h
@@ -0,0 +1,119 @@
+// OpenCL source of the copy kernels embedded as a string; Copy<TYPE>::allocate
+// passes it to Device::getKernel when ADL_LOAD_KERNEL_FROM_STRING is defined.
+// The bounds tests of Copy2F4Kernel / Copy4F4Kernel check the last element a
+// work item touches (2*gIdx+1, 4*gIdx+3); the former `<=` tests let the work
+// item starting exactly at m_n access elements past the end of the buffers.
+static const char* copyKernelsCL= \
+"/*\n"
+"		2011 Takahiro Harada\n"
+"*/\n"
+"\n"
+"#pragma OPENCL EXTENSION cl_amd_printf : enable\n"
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
+"\n"
+"typedef unsigned int u32;\n"
+"#define GET_GROUP_IDX get_group_id(0)\n"
+"#define GET_LOCAL_IDX get_local_id(0)\n"
+"#define GET_GLOBAL_IDX get_global_id(0)\n"
+"#define GET_GROUP_SIZE get_local_size(0)\n"
+"#define GROUP_LDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE)\n"
+"#define GROUP_MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE)\n"
+"#define AtomInc(x) atom_inc(&(x))\n"
+"#define AtomInc1(x, out) out = atom_inc(&(x))\n"
+"\n"
+"#define make_uint4 (uint4)\n"
+"#define make_uint2 (uint2)\n"
+"#define make_int2 (int2)\n"
+"\n"
+"typedef struct\n"
+"{\n"
+"	int m_n;\n"
+"	int m_padding[3];\n"
+"} ConstBuffer;\n"
+"\n"
+"\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void Copy1F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float4 a0 = src[gIdx];\n"
+"\n"
+"		dst[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void Copy2F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( 2*gIdx+1 < cb.m_n )\n"
+"	{\n"
+"		float4 a0 = src[gIdx*2+0];\n"
+"		float4 a1 = src[gIdx*2+1];\n"
+"\n"
+"		dst[ gIdx*2+0 ] = a0;\n"
+"		dst[ gIdx*2+1 ] = a1;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void Copy4F4Kernel(__global float4* dst, __global float4* src, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( 4*gIdx+3 < cb.m_n )\n"
+"	{\n"
+"		int idx0 = gIdx*4+0;\n"
+"		int idx1 = gIdx*4+1;\n"
+"		int idx2 = gIdx*4+2;\n"
+"		int idx3 = gIdx*4+3;\n"
+"\n"
+"		float4 a0 = src[idx0];\n"
+"		float4 a1 = src[idx1];\n"
+"		float4 a2 = src[idx2];\n"
+"		float4 a3 = src[idx3];\n"
+"\n"
+"		dst[ idx0 ] = a0;\n"
+"		dst[ idx1 ] = a1;\n"
+"		dst[ idx2 ] = a2;\n"
+"		dst[ idx3 ] = a3;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void CopyF1Kernel(__global float* dstF1, __global float* srcF1, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float a0 = srcF1[gIdx];\n"
+"\n"
+"		dstF1[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+"__kernel\n"
+"__attribute__((reqd_work_group_size(64,1,1)))\n"
+"void CopyF2Kernel(__global float2* dstF2, __global float2* srcF2, \n"
+"					ConstBuffer cb)\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < cb.m_n )\n"
+"	{\n"
+"		float2 a0 = srcF2[gIdx];\n"
+"\n"
+"		dstF2[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+;
--- a/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernelsDX11.h
+++ b/Extras/RigidBodyGpuPipeline/opencl/primitives/AdlPrimitives/Copy/CopyKernelsDX11.h
@@ -0,0 +1,120 @@
+// HLSL source of the copy kernels embedded as a string; Copy<TYPE>::allocate
+// passes it to Device::getKernel when ADL_LOAD_KERNEL_FROM_STRING is defined.
+// The bounds tests of Copy2F4Kernel / Copy4F4Kernel check the last element a
+// thread touches (2*gIdx+1, 4*gIdx+3); the former `<=` tests let the thread
+// starting exactly at m_n access elements past the requested range.
+static const char* copyKernelsDX11= \
+"/*\n"
+"		2011 Takahiro Harada\n"
+"*/\n"
+"\n"
+"typedef uint u32;\n"
+"\n"
+"#define GET_GROUP_IDX groupIdx.x\n"
+"#define GET_LOCAL_IDX localIdx.x\n"
+"#define GET_GLOBAL_IDX globalIdx.x\n"
+"#define GROUP_LDS_BARRIER GroupMemoryBarrierWithGroupSync()\n"
+"#define GROUP_MEM_FENCE\n"
+"#define DEFAULT_ARGS uint3 globalIdx : SV_DispatchThreadID, uint3 localIdx : SV_GroupThreadID, uint3 groupIdx : SV_GroupID\n"
+"#define AtomInc(x) InterlockedAdd(x, 1)\n"
+"#define AtomInc1(x, out) InterlockedAdd(x, 1, out)\n"
+"\n"
+"#define make_uint4 uint4\n"
+"#define make_uint2 uint2\n"
+"#define make_int2 int2\n"
+"\n"
+"#define WG_SIZE 64\n"
+"\n"
+"#define GET_GROUP_SIZE WG_SIZE\n"
+"\n"
+"\n"
+"\n"
+"cbuffer CB : register( b0 )\n"
+"{\n"
+"	int m_n;\n"
+"	int m_padding[3];\n"
+"};\n"
+"\n"
+"RWStructuredBuffer<float4> dst : register( u0 );\n"
+"StructuredBuffer<float4> src : register( t0 );\n"
+"\n"
+"[numthreads(WG_SIZE, 1, 1)]\n"
+"void Copy1F4Kernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < m_n )\n"
+"	{\n"
+"		float4 a0 = src[gIdx];\n"
+"\n"
+"		dst[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+"\n"
+"[numthreads(WG_SIZE, 1, 1)]\n"
+"void Copy2F4Kernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( 2*gIdx+1 < m_n )\n"
+"	{\n"
+"		float4 a0 = src[gIdx*2+0];\n"
+"		float4 a1 = src[gIdx*2+1];\n"
+"\n"
+"		dst[ gIdx*2+0 ] = a0;\n"
+"		dst[ gIdx*2+1 ] = a1;\n"
+"	}\n"
+"}\n"
+"\n"
+"[numthreads(WG_SIZE, 1, 1)]\n"
+"void Copy4F4Kernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( 4*gIdx+3 < m_n )\n"
+"	{\n"
+"		int idx0 = gIdx*4+0;\n"
+"		int idx1 = gIdx*4+1;\n"
+"		int idx2 = gIdx*4+2;\n"
+"		int idx3 = gIdx*4+3;\n"
+"\n"
+"		float4 a0 = src[idx0];\n"
+"		float4 a1 = src[idx1];\n"
+"		float4 a2 = src[idx2];\n"
+"		float4 a3 = src[idx3];\n"
+"\n"
+"		dst[ idx0 ] = a0;\n"
+"		dst[ idx1 ] = a1;\n"
+"		dst[ idx2 ] = a2;\n"
+"		dst[ idx3 ] = a3;\n"
+"	}\n"
+"}\n"
+"\n"
+"RWStructuredBuffer<float> dstF1 : register( u0 );\n"
+"StructuredBuffer<float> srcF1 : register( t0 );\n"
+"\n"
+"[numthreads(WG_SIZE, 1, 1)]\n"
+"void CopyF1Kernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < m_n )\n"
+"	{\n"
+"		float a0 = srcF1[gIdx];\n"
+"\n"
+"		dstF1[ gIdx ] = a0;\n"
+"	}\n"
+"\n"
+"}\n"
+"\n"
+"RWStructuredBuffer<float2> dstF2 : register( u0 );\n"
+"StructuredBuffer<float2> srcF2 : register( t0 );\n"
+"\n"
+"[numthreads(WG_SIZE, 1, 1)]\n"
+"void CopyF2Kernel( DEFAULT_ARGS )\n"
+"{\n"
+"	int gIdx = GET_GLOBAL_IDX;\n"
+"\n"
+"	if( gIdx < m_n )\n"
+"	{\n"
+"		float2 a0 = srcF2[gIdx];\n"
+"\n"
+"		dstF2[ gIdx ] = a0;\n"
+"	}\n"
+"}\n"
+;